From f0c4ca0143ee886ba84960b00e3f2700b6047429 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Wed, 10 Apr 2024 11:14:33 +0100
Subject: [PATCH] completed unitas

---
 .../ha_15_32/ha_analysis_batch_3.py           | 46 ++++++++++++++++++-
 1 file changed, 45 insertions(+), 1 deletion(-)

diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index 35bb63fe..f99c7b1a 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -7106,9 +7106,53 @@ def unitas_data_prep(loader):
             continue
 
         raise Exception("something went wrong")
+    outcome_matching = pd.DataFrame(outcome_matching)
+
+    # Matches can be duplicated, so we parse a date out of the "Date letters sent" column to identify the newest outcome
+    outcome_matching["Date letters sent"] = outcome_matching["Date letters sent"].str.lower()
+    outcome_matching["Extracted Date"] = outcome_matching["Date letters sent"].str.extract(
+        r'(?:w[./]c )(\d{2}\.\d{2}\.\d{4})')
+    outcome_matching["Extracted Date"] = pd.to_datetime(outcome_matching["Extracted Date"], format='%d.%m.%Y')
+    # We sort by asset_list_row_id, then by extracted date descending, so the newest outcome comes first for each id
+    outcome_matching = outcome_matching.sort_values(["asset_list_row_id", "Extracted Date"], ascending=[True, False])
+
+    # Some properties have two outcomes - we keep the newest row and add the older one's fields with a " second visit" suffix
+    outcome_matching_grouped = []
+    for asset_list_row_id, grouped_data in outcome_matching.groupby("asset_list_row_id"):
+        if grouped_data.shape[0] == 1:
+            outcome_matching_grouped.append(
+                {
+                    "Number of previous visits": 1,
+                    **grouped_data.to_dict("records")[0]
+                }
+            )
+            continue
+        if grouped_data.shape[0] == 2:
+            newest_visit = grouped_data.head(1)
+            oldest_visit = grouped_data.tail(1)[['Outcomes', 'Surveyor', 'Notes', 'Date letters sent']].add_suffix(
+                " second visit")
+            to_append = {
+                "Number of previous visits": 2,
+                **newest_visit.to_dict("records")[0],
+                **oldest_visit.to_dict("records")[0]
+            }
+            outcome_matching_grouped.append(to_append)
+        else:
+            raise Exception("something went wrong")
+
+    outcome_matching_grouped = pd.DataFrame(outcome_matching_grouped)
+
+    unitas_properties_to_survey_with_outcomes = unitas_properties_to_survey_full.merge(
+        outcome_matching_grouped, how="left", on="asset_list_row_id"
+    )
+    unitas_properties_to_survey_with_outcomes["Number of previous visits"] = (
+        unitas_properties_to_survey_with_outcomes["Number of previous visits"].fillna(0)
+    )
 
     # Store as an excel
-    unitas_properties_to_survey_full.to_excel("Unitas - phase 2 properties to Survey.xlsx")
+    unitas_properties_to_survey_with_outcomes.to_excel("Unitas - phase 2 properties to Survey.xlsx")
+
+    unitas_properties_to_survey_with_outcomes["Last EPC Built Form"].value_counts()
 
 
 def app():