From 8b48dbac9e5e9f25e3c738c1322b1f3a9fbb11db Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 26 Feb 2024 13:37:50 +0000
Subject: [PATCH] Normalise ECO eligibility values and add per-HA sales report

---
 .../ha_15_32/ha_analysis_batch_3.py           | 153 ++++++++++++++----
 1 file changed, 122 insertions(+), 31 deletions(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index dbe12e92..fdc00876 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -725,6 +725,13 @@ class DataLoader:
 
     def load(self):
 
+        # Load the December figures, which are stored as a plain CSV file
+        self.december_figures = pd.read_csv(self.december_figures_filepath)
+        # Remove the spaces in HA Name
+        self.december_figures["HA Name"] = self.december_figures["HA Name"].str.replace(" ", "")
+        self.december_figures["ECO4"] = self.december_figures["ECO4"].astype("Int64")
+        self.december_figures["GBIS"] = self.december_figures["GBIS"].astype("Int64")
+
         if self.use_cache:
             self.data = read_pickle_from_s3(
                 bucket_name="retrofit-datalake-dev",
@@ -732,11 +739,6 @@ class DataLoader:
             )
             return
 
-        # Get the december figures, which is just a csv
-        self.december_figures = pd.read_csv(self.december_figures_filepath)
-        # Remove the spaces in HA Name
-        self.december_figures["HA Name"] = december_figures["HA Name"].str.replace(" ", "")
-
         data = {}
         for filepath in self.directories:
             ha_name = filepath.split("/")[2]
@@ -768,46 +770,135 @@ class DataLoader:
         This function will return a dictionary of facts and figures for each HA
         :return:
         """
+
+        scheme_map = {
+            "ECO4": "ECO4",
+            "AFFORDABLE WARMTH": "ECO4",
+        }
+
+        eco_eligibility_map = {
+            "not eligble": "not eligible"
+        }
+
         ha_facts_and_figures = []
         for ha_name, data_assets in self.data.items():
             asset_list = data_assets["asset_list"].copy()
             survey_list = data_assets["survey_list"].copy()
             ciga_list = data_assets["ciga_list"].copy()
 
-            asset_list["ECO Eligibility"].value_counts()
+            # Rename the "ECO eligibility" spelling of the column, if present, to "ECO Eligibility"
+            asset_list = asset_list.rename(columns={"ECO eligibility": "ECO Eligibility"})
+            # Strip leading and trailing whitespace from the ECO Eligibility column
+            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.strip()
+            # Push to lower case
+            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].str.lower()
+            # Fix known misspellings. NOTE(review): .map() turns values missing from the dict into NaN
+            asset_list["ECO Eligibility"] = asset_list["ECO Eligibility"].map(eco_eligibility_map)
 
-            # We merge on ciga and update the status to reflect if it has failed ciga or not
-            # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA
-            # check
-            asset_list = asset_list.merge(
-                ciga_list[["asset_list_row_id", "Guarantee"]],
-                how='left',
-                on="asset_list_row_id"
-            )
+            if not ciga_list.empty:
+                # We merge on ciga and update the status to reflect if it has failed ciga or not
+                # If Guarantee is Yes, this means that there is a guarantee in place, and the property failed the CIGA
+                # check
+                asset_list = asset_list.merge(
+                    ciga_list[["asset_list_row_id", "Guarantee"]],
+                    how='left',
+                    on="asset_list_row_id"
+                )
 
-            asset_list["ECO Eligibility"].value_counts()
+                asset_list["ECO Eligibility"].value_counts()
 
-            asset_list["ECO Eligibility"] = np.where(
-                (
-                    asset_list["ECO Eligibility"].str.contains("(Subject to CIGA)", regex=False) &
-                    (asset_list["Guarantee"] == "Yes")
-                ),
-                "Failed CIGA",
-                asset_list["ECO Eligibility"]
-            )
+                asset_list["ECO Eligibility"] = np.where(
+                    (
+                        asset_list["ECO Eligibility"].str.contains("(subject to ciga)", regex=False) &
+                        (asset_list["Guarantee"] == "Yes")
+                    ),
+                    "failed ciga",
+                    asset_list["ECO Eligibility"]
+                )
 
-            # We replace any remaining "Subject to CIGA" with pass Ciga
-            asset_list["ECO Eligibility"] = np.where(
-                asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False),
-                "Pass CIGA",
-                asset_list["ECO Eligibility"]
-            )
+                # Mark remaining "subject to ciga" rows as passed. NOTE(review): pattern below is not lower-cased
+                asset_list["ECO Eligibility"] = np.where(
+                    asset_list["ECO Eligibility"].str.contains("Subject to CIGA", regex=False),
+                    "eco4 - passed ciga",
+                    asset_list["ECO Eligibility"]
+                )
 
-            asset_list = asset_list.drop(columns=["Guarantee"])
+                asset_list = asset_list.drop(columns=["Guarantee"])
 
-            # Update the asset list with the categorisations
+            # Update the asset list with the categorisations and rename changes
             self.data[ha_name]["asset_list"] = asset_list
 
+            # Report on sales
+            sales_report = {}
+            if not survey_list.empty:
+                scheme_column = survey_list.columns[0]
+                # Normalise the "INSTALLED OR CANCELLED" column of the survey list into a helper column
+                survey_list["installed_or_cancelled_clean"] = survey_list["INSTALLED OR CANCELLED"].str.lower()
+                # Remove all punctuation
+                survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
+                    r'[^\w\s]', '', regex=True
+                )
+                # Collapse each run of whitespace into a single space
+                survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.replace(
+                    r'\s+', ' ', regex=True
+                )
+                # Remove leading and trailing whitespace
+                survey_list["installed_or_cancelled_clean"] = survey_list["installed_or_cancelled_clean"].str.strip()
+
+                # Map scheme names to canonical labels. NOTE(review): unmapped schemes become NaN
+                survey_list[scheme_column] = survey_list[scheme_column].map(scheme_map)
+
+                survey_list["installation_status"] = None
+                survey_list["installation_status"] = np.where(
+                    survey_list["installed_or_cancelled_clean"].isin(["installed", "installed see notes"]),
+                    "installed",
+                    survey_list["installation_status"]
+                )
+                survey_list["installation_status"] = np.where(
+                    survey_list["installed_or_cancelled_clean"].isin(["cancelled"]),
+                    "cancelled",
+                    survey_list["installation_status"]
+                )
+                # Find partial installations
+                survey_list["installation_status"] = np.where(
+                    survey_list["installed_or_cancelled_clean"].str.contains("still to be installed"),
+                    "partially installed",
+                    survey_list["installation_status"]
+                )
+                # Find partial cancellations
+                # TODO: We might have more indications of partial cancellations
+                survey_list["installation_status"] = np.where(
+                    survey_list["installed_or_cancelled_clean"].isin(["loft cancelled"]),
+                    "partially cancelled",
+                    survey_list["installation_status"]
+                )
+
+                # Finally, for other cases, we set the status to "in progress"
+                survey_list["installation_status"] = survey_list["installation_status"].fillna("in progress")
+
+                # We concatenate the scheme name with the installation status
+                survey_list["installation_status"] = (
+                    survey_list[scheme_column] + " - " + survey_list["installation_status"]
+                )
+
+                # Count surveys per "<scheme> - <status>" label
+                sales_report = survey_list["installation_status"].value_counts().to_dict()
+
+            ha_facts_and_figures.append(
+                {
+                    "HA Name": ha_name,
+                    **asset_list["ECO Eligibility"].value_counts().to_dict(),
+                    **sales_report
+                }
+            )
+
+        ha_facts_and_figures = pd.DataFrame(ha_facts_and_figures)
+        ha_facts_and_figures = ha_facts_and_figures.drop(
+            columns=["not eligible"]
+        )
+
+        ha_facts_and_figures = self.december_figures.merge(ha_facts_and_figures, how="inner", on="HA Name")
+
         return ha_facts_and_figures