deem score improved

2026-06-08 11:17:29 +00:00 · 2025-04-22 16:16:39 +00:00 · 2025-04-22 16:16:39 +00:00 · f9633618b1
commit f9633618b1
parent aca37ea10d
5 changed files with 92 additions and 22 deletions
--- a/etl/age_band_calculator.py
+++ b/etl/age_band_calculator.py
@ -0,0 +1,59 @@
+import os
+os.environ["SHAREPOINT_CLIENT_ID"] = "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf"
+os.environ["SHAREPOINT_CLIENT_SECRET"] = "SOf8Q~-is4wdQiqvEEm9FlJQRAY9ELGaj5Qz-a6E"
+os.environ["SHAREPOINT_TENANT_ID"] = "c3f7519c-2719-4547-af04-6da6cbfd8f8f"
+os.environ["SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"] = "b5a51507-9427-4ee0-b03e-90ec7681e2d3"
+os.environ["JJC_SERVICE_SHAREPOINT_ID"] = "7fdd0485-bbf3-4b29-b30f-98c81c2a6284"
+from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING
+import pandas as pd
+from etl.surveyedData.surveryedData import surveyedDataProcessor
+
+import etl.scraper.scraper as scraper_module
+
+def return_pandas_from_scraping(week_commencing, installer):
+    scraper_module.WEEK_COMMENCING = week_commencing
+    sp = SharePointScraper(installer)
+    file_paths = sp.download_file_for_each_address()
+    list_of_surveys = []
+    list_ = []
+    for eachAddress in file_paths:
+        for address, files in eachAddress.items():
+            list_of_surveys.append(surveyedDataProcessor(address, files))
+
+    for survey in list_of_surveys:
+        dict_ = {}
+        if survey.pre_site_note:
+            dict_.update({"address": survey.address})
+            dict_.update({"age_band": survey.pre_site_note.property_description.main_property.age_band})
+            list_.append(dict_)
+
+    if list_:
+        return pd.DataFrame(list_)
+    else:
+        return None
+
+installers = [SharePointInstaller.JJC, SharePointInstaller.SOUTH_COAST_INSULATION]
+dates = [
+    "W.C. 14.04.2025",
+    "W.C. 31.03.2025",
+    "W.C. 24.03.2025",
+    "W.C. 17.03.2025",
+    "W.C. 10.03.2025",
+    "W.C. 03.03.2025",
+    "W.C. 24.02.2025",
+]
+
+all_dfs = []
+
+for installer in installers:
+    for date in dates:
+        df = return_pandas_from_scraping(date, installer)
+        if df is not None:
+            df["installer"] = installer.name
+            df["week_commencing"] = date
+            all_dfs.append(df)
+
+giant_df = pd.concat(all_dfs, ignore_index=True)
+giant_df
+giant_df.to_csv("age_band.csv")
+
--- a/etl/hubspot_to_invoice.py
+++ b/etl/hubspot_to_invoice.py
@ -21,7 +21,7 @@ output_path = os.path.abspath(verbose_file)
 sp.upload_to_sharepoint(output_path, verbose_file)

 lewis_view = "FOR_LEWIS.xlsx"
-selected_columns = ["INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
+selected_columns = ["HUBSPOT_INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
 minimal_df = df[selected_columns]
 minimal_df.to_excel(lewis_view, index=False)
 output_path = os.path.abspath(lewis_view)
@ -58,14 +58,14 @@ sp.move_deals_to_completed(deal_ids)

 # SCIS
 # 3 examples of Solar
-# ( not in hubspot ) 12 short hedges - Solar 1608
-# ( not in hubspot ) 18 short hedge - Solar 1608
-# ( not in hubspot) 6 forety road -Solar 1608
+# ( in hubspot ) 12 short hedges - Solar 1608
+# ( in hubspot ) 18 short hedge - Solar 1608
+# ( in hubspot) 6 forety road -Solar 1608

 # 3 examples Cavity Wall, FOAM, Empty and General ideally
-# ( not in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
-# ( not in hubspot ) 2 queensway, (Fibre) - 500
-# ( not in hubspot )56 Aughton Crescent -(foam) -  To be worked out by Lewis but lets use this as an oppurtunity -
+# ( in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
+# (  hubspot ) 2 queensway, (Fibre) - 500
+# ( in hubspot )56 Aughton Crescent -(foam) -  To be worked out by Lewis but let's use this as an opportunity -

 # Compare value with what I should get and in the deem score. Keep tabs below so I can check easily

--- a/etl/imagefilenamechcker.py
+++ b/etl/imagefilenamechcker.py
@ -69,11 +69,9 @@ for files in list_of_pictures:
        if 'file' in file:
            url = file['@microsoft.graph.downloadUrl']
            print(f"Downloading {files}/{file['name']}")
-            sha256 = calculate_sha256(south_coast_scraper.get_file_content(url))
            final_list.append({
                "Directories": files,
                "Photo Name": file['name'],
-                "sha256": sha256,
            })

 final_df = pd.DataFrame(final_list)
--- a/etl/scraper/scraper.py
+++ b/etl/scraper/scraper.py
@ -14,7 +14,7 @@ from datetime import datetime, timedelta
 def previous_monday():
    today = datetime.today()
    last_monday = today - timedelta(days=today.weekday() + 7)  # Go back to last week's Monday
-    return f"W.C. 31.03.2025"
+    return f"W.C. 31.09.2000"
    # return f"W.C. {last_monday.strftime('%d.%m.%Y')}"

 WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", previous_monday())
--- a/etl/surveyPrice/surveyPrice.py
+++ b/etl/surveyPrice/surveyPrice.py
@ -151,6 +151,7 @@ class SurveyPrice():

    
    def sharepoint_data_for_installer(self, installer):
+
        sp = SharePointScraper(installer)
        file_paths = sp.download_file_for_each_address()
        surveys = []
@ -173,7 +174,7 @@ class SurveyPrice():
                "SHAREPOINT FLOOR_AREA_BANDING": "NO PRE SITE NOTES FOUND",
                "SHAREPOINT PRE_INSTALL_SAP_SCORE": "NO PRE SITE NOTES FOUND",
                "SHAREPOINT INSULATION MATERIAL": None,
-                "SHAREPOINT ADDRESS": address
+                "SHAREPOINT ADDRESS": surveyInfo.address
            }

            if surveyInfo.pre_site_note:
@ -231,14 +232,24 @@ class SurveyPrice():
            raise RuntimeError("No information found from Hubspot")

        # Standardise address
-        self.all_survey_info_from_sharepoint['clean_address'] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
-            lambda x: x.lower().replace(',', '').strip()
-        ) 
+        def extract_start_and_postcode(addr):
+            if not isinstance(addr, str) or addr.strip() == "":
+                return "", ""
+            parts = addr.lower().replace(",", "").strip().split()
+            start = ' '.join(parts[:2])  # Number + street
+            postcode = ' '.join(parts[-2:])  # Postcode
+            return start, postcode

-        self.all_hubspot_submissions['clean_address'] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
-            lambda x: x.lower().replace(',', '').strip()
+        # Extract start + postcode from both datasets
+        self.all_survey_info_from_sharepoint[['address_start', 'postcode']] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
+            lambda x: pd.Series(extract_start_and_postcode(x))
        )

+        self.all_hubspot_submissions[['address_start', 'postcode']] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
+            lambda x: pd.Series(extract_start_and_postcode(x))
+        )
+
+
        # re-name to installer
        self.all_survey_info_from_sharepoint = self.all_survey_info_from_sharepoint.rename(
            columns={
@ -254,14 +265,14 @@ class SurveyPrice():
        )

        merged_df = pd.merge(
-            self.all_survey_info_from_sharepoint, 
-            self.all_hubspot_submissions, 
-            left_on=['clean_address'], 
-            right_on=['clean_address'], 
+            self.all_survey_info_from_sharepoint,
+            self.all_hubspot_submissions,
+            on=['address_start', 'postcode'],
            how='inner'
        )

-        merged_df.drop(columns=['clean_address'], inplace=True)
+        merged_df.drop(columns=['address_start', 'postcode'], inplace=True)
+

        def compute_energy_grant(row):
            pre_band_letter = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"][-1]
@ -274,12 +285,14 @@ class SurveyPrice():
    
        def work_type(row):
            if row["ENERGY_GRANT"] == "GBIS":
-                return row["ENERGY GRANT"]
+                return "GBIS"
            else:
                return f"{row["ENERGY_GRANT"]} - SAP {row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"]} to {row["POST_INSTALL_SAP_SCORE_BANDING"]}"


        # Add missing variables
+        if merged_df.size == 0:
+            raise RuntimeError("no matched addresses with hubspot and sharepoint pre site notes")
        merged_df["ENERGY_GRANT"] = merged_df.apply(compute_energy_grant, axis=1)
        merged_df["POST_INSTALL_SAP_SCORE_BANDING"] = merged_df.apply(compute_banding_for_post_sap, axis=1)
        merged_df["WORK TYPE"] = merged_df.apply(work_type, axis=1)