From 15bcd46e24aac991dbc9bdd31c8c40970649f0f0 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 16 Oct 2024 17:30:15 +0100
Subject: [PATCH] extended vectis cases and added metrics

---
 etl/epc/Record.py                  |  2 +
 etl/epc/generate_scenarios_data.py | 84 +++++++++++++++++++++++-------
 2 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index cc70d42b..4c1a912b 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -575,6 +575,8 @@ class EPCRecord:
         mains_gas_map = {
             "Y": True,
             "N": False,
+            True: True,
+            False: False
         }
 
         self.prepared_epc["mains-gas-flag"] = (
diff --git a/etl/epc/generate_scenarios_data.py b/etl/epc/generate_scenarios_data.py
index 1714fe39..fe27cc91 100644
--- a/etl/epc/generate_scenarios_data.py
+++ b/etl/epc/generate_scenarios_data.py
@@ -1,5 +1,6 @@
 from datetime import datetime, timezone, date
 import itertools
+from tqdm import tqdm
 
 import pandas as pd
 from etl.epc.Record import EPCRecord
@@ -20,6 +21,7 @@ from backend.Property import Property
 from recommendations.Recommendations import Recommendations
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet, save_dataframe_to_s3_parquet
+from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
 
 now = datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
 
@@ -1165,7 +1167,7 @@ scenario_properties = [
 
 recommendations_scoring_data = []
 
-for scenario_property in scenario_properties:
+for scenario_property in tqdm(scenario_properties):
     # We validate each record in the file. If the record is NOT valid, we need to handle this accordingly
 
     epc_searcher = SearchEpc(
@@ -1174,22 +1176,35 @@ for scenario_property in scenario_properties:
         auth_token=get_settings().EPC_AUTH_TOKEN,
         os_api_key=get_settings().ORDNANCE_SURVEY_API_KEY,
     )
-    epc_searcher.find_property()
 
-    # Find the epc with the same LMK key
-    all_epcs = epc_searcher.older_epcs.copy()
-    all_epcs.extend([epc_searcher.newest_epc, epc_searcher.full_sap_epc])
-    original_epc = [
-        epc
-        for epc in all_epcs
-        if epc.get("lmk-key", None) == scenario_property.get("lmk-key")
-    ][0]
+    if scenario_property.get("lmk-key") is None:  # no LMK key: EPC data is supplied inline
+        epc_records = {
+            "original_epc": scenario_property["epc"],
+            "full_sap_epc": {},
+            "old_data": scenario_property["old_epcs"],
+        }
+        address = scenario_property["address"]
+        postcode = scenario_property["postcode"]
+    else:
+        epc_searcher.find_property()
 
-    epc_records = {
-        "original_epc": original_epc,
-        "full_sap_epc": {},
-        "old_data": [],
-    }
+        # Pick the EPC whose LMK key matches this scenario; [0] raises IndexError if none matches
+        all_epcs = epc_searcher.older_epcs.copy()
+        all_epcs.extend([epc_searcher.newest_epc, epc_searcher.full_sap_epc])
+        original_epc = [
+            epc
+            for epc in all_epcs
+            if epc.get("lmk-key", None) == scenario_property.get("lmk-key")
+        ][0]
+
+        epc_records = {
+            "original_epc": original_epc,
+            "full_sap_epc": {},
+            "old_data": [],
+        }
+
+        address = epc_searcher.address_clean
+        postcode = epc_searcher.postcode_clean
 
     prepared_epc = EPCRecord(
         epc_records=epc_records, run_mode="newdata", cleaning_data=cleaning_data
@@ -1197,8 +1212,8 @@ for scenario_property in scenario_properties:
 
     p = Property(
         id=prepared_epc.uprn,
-        address=epc_searcher.address_clean,
-        postcode=epc_searcher.postcode_clean,
+        address=address,
+        postcode=postcode,
         epc_record=prepared_epc,
     )
 
@@ -1220,8 +1235,10 @@ for scenario_property in scenario_properties:
 
     wall_recommendations = recommender.wall_recomender.recommendations
     loft_recommendations = recommender.roof_recommender.recommendations
+    floor_recommendations = recommender.floor_recommender.recommendations
     solar_recommendations = recommender.solar_recommender.recommendation
     windows_recommendations = recommender.windows_recommender.recommendation
+    led_recommendations = recommender.lighting_recommender.recommendation
 
     p.create_base_difference_epc_record(cleaned_lookup=cleaned)
 
@@ -1236,8 +1253,10 @@ for scenario_property in scenario_properties:
 
         wall_recs = []
         loft_recs = []
+        floor_recs = []
         solar_recs = []
         windows_recs = []
+        lighting_recs = []
 
         if "internal_wall_insulation" in measure:
             for rec in wall_recommendations:
@@ -1265,12 +1284,22 @@ for scenario_property in scenario_properties:
                 if rec["type"] == "solar_pv":
                     solar_recs.append(rec)
 
-        if "windows" in measure:
+        if "windows" in measure or "secondary_glazing" in measure or "double_glazing" in measure:
             for rec in windows_recommendations:
                 if rec["type"] == "windows_glazing":
                     windows_recs.append(rec)
 
-        combi_list = [wall_recs, loft_recs, solar_recs, windows_recs]
+        if "low_energy_lighting" in measure:
+            for rec in led_recommendations:
+                if rec["type"] == "led_lighting":
+                    lighting_recs.append(rec)
+
+        if "suspended_floor_insulation" in measure:
+            for rec in floor_recommendations:
+                if rec["type"] == "suspended_floor_insulation":
+                    floor_recs.append(rec)
+
+        combi_list = [wall_recs, loft_recs, solar_recs, windows_recs, lighting_recs, floor_recs]
         combi_list = [element for element in combi_list if len(element) != 0]
 
         all_combi_recommendations = list(itertools.product(*combi_list))
@@ -1331,6 +1360,23 @@ sap_impact = pd.concat(
     axis=1
 )
 sap_impact["predicted_impact"] = sap_impact["predictions"] - sap_impact["sap_starting"]
+sap_impact["actual_post_sap"] = sap_impact["impact"] + sap_impact["sap_starting"]
+sap_impact = sap_impact[
+    [
+        'id', 'property_id', 'recommendation_id', 'phase', 'uprn', 'sap_starting', 'predictions', 'actual_post_sap',
+        'impact', 'predicted_impact'
+    ]
+].rename(
+    columns={"predictions": "predicted_post_sap", "impact": "actual_impact"}
+)
+
+# Compute error metrics (MAE and MAPE) for local testing; the values are not saved or logged here
+mae = mean_absolute_error(sap_impact["actual_post_sap"], sap_impact["predicted_post_sap"])
+# mae -> 2.2958333333333347 (presumably the value seen on a local run)
+mape = mean_absolute_percentage_error(sap_impact["actual_post_sap"], sap_impact["predicted_post_sap"])
+# mape -> 0.034359867214274246 (presumably the value seen on a local run)
+mape_impact = mean_absolute_percentage_error(sap_impact["actual_impact"], sap_impact["predicted_impact"])
+# mape_impact -> 0.4853675375550377 (presumably from a local run); NOTE: MAPE explodes if any actual_impact is 0
 
 save_dataframe_to_s3_parquet(
     recommendations_scoring_data,