From f9633618b1888be869b160fcb3d991e7a62df2f5 Mon Sep 17 00:00:00 2001
From: Jun-te Kim <junte@domna.homes>
Date: Tue, 22 Apr 2025 16:16:39 +0000
Subject: [PATCH] deem score improved

---
 etl/age_band_calculator.py                    | 59 +++++++++++++++++++
 etl/hubspot_to_invoice.py                     | 14 ++---
 ...{filechecker.py => imagefilenamechcker.py} |  2 -
 etl/scraper/scraper.py                        |  2 +-
 etl/surveyPrice/surveyPrice.py                | 37 ++++++++----
 5 files changed, 92 insertions(+), 22 deletions(-)
 create mode 100644 etl/age_band_calculator.py
 rename etl/{filechecker.py => imagefilenamechcker.py} (95%)

diff --git a/etl/age_band_calculator.py b/etl/age_band_calculator.py
new file mode 100644
index 0000000..1c09476
--- /dev/null
+++ b/etl/age_band_calculator.py
@@ -0,0 +1,59 @@
+import os
+# Credentials are read from the environment; secrets must never be committed to the repo.
+for _name in ("SHAREPOINT_CLIENT_ID", "SHAREPOINT_CLIENT_SECRET", "SHAREPOINT_TENANT_ID",
+              "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", "JJC_SERVICE_SHAREPOINT_ID"):
+    if not os.environ.get(_name):  # checked before the scraper module is imported below
+        raise RuntimeError(f"Missing required environment variable: {_name}")
+from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING
+import pandas as pd
+from etl.surveyedData.surveryedData import surveyedDataProcessor
+
+import etl.scraper.scraper as scraper_module
+
+def return_pandas_from_scraping(week_commencing, installer):
+    """Build an address/age-band DataFrame for one installer and week folder.
+
+    Points the scraper module's ``WEEK_COMMENCING`` at ``week_commencing`` first.
+    Returns ``None`` when no scraped survey carries a pre-site note.
+    """
+    scraper_module.WEEK_COMMENCING = week_commencing
+    downloads = SharePointScraper(installer).download_file_for_each_address()
+    # Build every survey processor up front, then filter in a second pass.
+    surveys = [
+        surveyedDataProcessor(address, files)
+        for per_address in downloads
+        for address, files in per_address.items()
+    ]
+    # Only surveys with a pre-site note contribute a row.
+    rows = [
+        {"address": s.address, "age_band": s.pre_site_note.property_description.main_property.age_band}
+        for s in surveys
+        if s.pre_site_note
+    ]
+    return pd.DataFrame(rows) if rows else None
+
+installers = [SharePointInstaller.JJC, SharePointInstaller.SOUTH_COAST_INSULATION]
+dates = [  # week-commencing labels passed to the scraper, one scrape per label
+    "W.C. 14.04.2025",
+    "W.C. 31.03.2025",
+    "W.C. 24.03.2025",
+    "W.C. 17.03.2025",
+    "W.C. 10.03.2025",
+    "W.C. 03.03.2025",
+    "W.C. 24.02.2025",
+]
+
+all_dfs = []  # one tagged frame per installer/week that produced any rows
+for installer in installers:
+    for date in dates:
+        df = return_pandas_from_scraping(date, installer)
+        if df is not None:
+            df["installer"] = installer.name
+            df["week_commencing"] = date
+            all_dfs.append(df)
+
+if not all_dfs:  # pd.concat([]) raises an opaque "No objects to concatenate" ValueError
+    raise RuntimeError("no age band data found for any installer or week")
+giant_df = pd.concat(all_dfs, ignore_index=True)
+giant_df.to_csv("age_band.csv")
+
diff --git a/etl/hubspot_to_invoice.py b/etl/hubspot_to_invoice.py
index ddbaeff..a106b4b 100644
--- a/etl/hubspot_to_invoice.py
+++ b/etl/hubspot_to_invoice.py
@@ -21,7 +21,7 @@ output_path = os.path.abspath(verbose_file)
 sp.upload_to_sharepoint(output_path, verbose_file)
 
 lewis_view = "FOR_LEWIS.xlsx"
-selected_columns = ["INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
+selected_columns = ["HUBSPOT_INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
 minimal_df = df[selected_columns]
 minimal_df.to_excel(lewis_view, index=False)
 output_path = os.path.abspath(lewis_view)
@@ -58,14 +58,14 @@ sp.move_deals_to_completed(deal_ids)
 
 # SCIS
 # 3 examples of Solar
-# ( not in hubspot ) 12 short hedges - Solar 1608
-# ( not in hubspot ) 18 short hedge - Solar 1608
-# ( not in hubspot) 6 forety road -Solar 1608
+# ( in hubspot ) 12 short hedges - Solar 1608
+# ( in hubspot ) 18 short hedge - Solar 1608
+# ( in hubspot) 6 forety road -Solar 1608
 
 # 3 examples Cavity Wall, FOAM, Empty and General ideally
-# ( not in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
-# ( not in hubspot ) 2 queensway, (Fibre) - 500
-# ( not in hubspot )56 Aughton Crescent -(foam) -  To be worked out by Lewis but lets use this as an oppurtunity -
+# ( in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
+# ( in hubspot ) 2 queensway, (Fibre) - 500
+# ( in hubspot ) 56 Aughton Crescent - (foam) - To be worked out by Lewis, but let's use this as an opportunity -
 
 # Compare value with what I should get and in the deem score. Keep tabs below so I can check easily
 
diff --git a/etl/filechecker.py b/etl/imagefilenamechcker.py
similarity index 95%
rename from etl/filechecker.py
rename to etl/imagefilenamechcker.py
index ce97157..4e32855 100644
--- a/etl/filechecker.py
+++ b/etl/imagefilenamechcker.py
@@ -69,11 +69,9 @@ for files in list_of_pictures:
         if 'file' in file:
             url = file['@microsoft.graph.downloadUrl']
             print(f"Downloading {files}/{file['name']}")
-            sha256 = calculate_sha256(south_coast_scraper.get_file_content(url))
             final_list.append({
                 "Directories": files,
                 "Photo Name": file['name'],
-                "sha256": sha256,
             })
 
 final_df = pd.DataFrame(final_list)
diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py
index 9474eff..a379894 100644
--- a/etl/scraper/scraper.py
+++ b/etl/scraper/scraper.py
@@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 def previous_monday():
     today = datetime.today()
     last_monday = today - timedelta(days=today.weekday() + 7)  # Go back to last week's Monday
-    return f"W.C. 31.03.2025"
+    return f"W.C. {last_monday.strftime('%d.%m.%Y')}"  # computed label, not a hard-coded date
     # return f"W.C. {last_monday.strftime('%d.%m.%Y')}"
 
 WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", previous_monday())
diff --git a/etl/surveyPrice/surveyPrice.py b/etl/surveyPrice/surveyPrice.py
index cac0060..8fe1f64 100644
--- a/etl/surveyPrice/surveyPrice.py
+++ b/etl/surveyPrice/surveyPrice.py
@@ -151,6 +151,7 @@ class SurveyPrice():
 
     
     def sharepoint_data_for_installer(self, installer):
+
         sp = SharePointScraper(installer)
         file_paths = sp.download_file_for_each_address()
         surveys = []
@@ -173,7 +174,7 @@ class SurveyPrice():
                 "SHAREPOINT FLOOR_AREA_BANDING": "NO PRE SITE NOTES FOUND",
                 "SHAREPOINT PRE_INSTALL_SAP_SCORE": "NO PRE SITE NOTES FOUND",
                 "SHAREPOINT INSULATION MATERIAL": None,
-                "SHAREPOINT ADDRESS": address
+                "SHAREPOINT ADDRESS": surveyInfo.address
             }
 
             if surveyInfo.pre_site_note:
@@ -231,14 +232,24 @@ class SurveyPrice():
             raise RuntimeError("No information found from Hubspot")
 
         # Standardise address
-        self.all_survey_info_from_sharepoint['clean_address'] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
-            lambda x: x.lower().replace(',', '').strip()
-        ) 
+        def extract_start_and_postcode(addr):
+            """Return (first two tokens, last two tokens) of a lower-cased, comma-free address."""
+            if not (isinstance(addr, str) and addr.strip()):
+                return "", ""
+            tokens = addr.lower().replace(",", "").split()
+            # Leading pair is treated as number + street, trailing pair as the postcode.
+            return " ".join(tokens[:2]), " ".join(tokens[-2:])
 
-        self.all_hubspot_submissions['clean_address'] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
-            lambda x: x.lower().replace(',', '').strip()
+        # Extract start + postcode from both datasets
+        self.all_survey_info_from_sharepoint[['address_start', 'postcode']] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
+            lambda x: pd.Series(extract_start_and_postcode(x))
         )
 
+        self.all_hubspot_submissions[['address_start', 'postcode']] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
+            lambda x: pd.Series(extract_start_and_postcode(x))
+        )
+
+
         # re-name to installer
         self.all_survey_info_from_sharepoint = self.all_survey_info_from_sharepoint.rename(
             columns={
@@ -254,14 +265,14 @@ class SurveyPrice():
         )
 
         merged_df = pd.merge(
-            self.all_survey_info_from_sharepoint, 
-            self.all_hubspot_submissions, 
-            left_on=['clean_address'], 
-            right_on=['clean_address'], 
+            self.all_survey_info_from_sharepoint,
+            self.all_hubspot_submissions,
+            on=['address_start', 'postcode'],
             how='inner'
         )
 
-        merged_df.drop(columns=['clean_address'], inplace=True)
+        merged_df.drop(columns=['address_start', 'postcode'], inplace=True)
+
 
         def compute_energy_grant(row):
             pre_band_letter = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"][-1]
@@ -274,12 +285,14 @@ class SurveyPrice():
     
         def work_type(row):
             if row["ENERGY_GRANT"] == "GBIS":
-                return row["ENERGY GRANT"]
+                return "GBIS"
             else:
                 return f"{row["ENERGY_GRANT"]} - SAP {row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"]} to {row["POST_INSTALL_SAP_SCORE_BANDING"]}"
 
 
         # Add missing variables
+        if merged_df.size == 0:
+            raise RuntimeError("no matched addresses with hubspot and sharepoint pre site notes")
         merged_df["ENERGY_GRANT"] = merged_df.apply(compute_energy_grant, axis=1)
         merged_df["POST_INSTALL_SAP_SCORE_BANDING"] = merged_df.apply(compute_banding_for_post_sap, axis=1)
         merged_df["WORK TYPE"] = merged_df.apply(work_type, axis=1)