From f9633618b1888be869b160fcb3d991e7a62df2f5 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 22 Apr 2025 16:16:39 +0000 Subject: [PATCH] deeem score improved --- etl/age_band_calculator.py | 59 +++++++++++++++++++ etl/hubspot_to_invoice.py | 14 ++--- ...{filechecker.py => imagefilenamechcker.py} | 2 - etl/scraper/scraper.py | 2 +- etl/surveyPrice/surveyPrice.py | 37 ++++++++---- 5 files changed, 92 insertions(+), 22 deletions(-) create mode 100644 etl/age_band_calculator.py rename etl/{filechecker.py => imagefilenamechcker.py} (95%) diff --git a/etl/age_band_calculator.py b/etl/age_band_calculator.py new file mode 100644 index 0000000..1c09476 --- /dev/null +++ b/etl/age_band_calculator.py @@ -0,0 +1,59 @@ +import os +os.environ["SHAREPOINT_CLIENT_ID"] = "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf" +os.environ["SHAREPOINT_CLIENT_SECRET"] = "SOf8Q~-is4wdQiqvEEm9FlJQRAY9ELGaj5Qz-a6E" +os.environ["SHAREPOINT_TENANT_ID"] = "c3f7519c-2719-4547-af04-6da6cbfd8f8f" +os.environ["SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"] = "b5a51507-9427-4ee0-b03e-90ec7681e2d3" +os.environ["JJC_SERVICE_SHAREPOINT_ID"] = "7fdd0485-bbf3-4b29-b30f-98c81c2a6284" +from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING +import pandas as pd +from etl.surveyedData.surveryedData import surveyedDataProcessor + +import etl.scraper.scraper as scraper_module + +def return_pandas_from_scraping(week_commencing, installer): + scraper_module.WEEK_COMMENCING = week_commencing + sp = SharePointScraper(installer) + file_paths = sp.download_file_for_each_address() + list_of_surveys = [] + list_ = [] + for eachAddress in file_paths: + for address, files in eachAddress.items(): + list_of_surveys.append(surveyedDataProcessor(address, files)) + + for survey in list_of_surveys: + dict_ = {} + if survey.pre_site_note: + dict_.update({"address": survey.address}) + dict_.update({"age_band": survey.pre_site_note.property_description.main_property.age_band}) + list_.append(dict_) + + 
if list_: + return pd.DataFrame(list_) + else: + return None + +installers = [SharePointInstaller.JJC, SharePointInstaller.SOUTH_COAST_INSULATION] +dates = [ + "W.C. 14.04.2025", + "W.C. 31.03.2025", + "W.C. 24.03.2025", + "W.C. 17.03.2025", + "W.C. 10.03.2025", + "W.C. 03.03.2025", + "W.C. 24.02.2025", +] + +all_dfs = [] + +for installer in installers: + for date in dates: + df = return_pandas_from_scraping(date, installer) + if df is not None: + df["installer"] = installer.name + df["week_commencing"] = date + all_dfs.append(df) + +giant_df = pd.concat(all_dfs, ignore_index=True) +giant_df +giant_df.to_csv("age_band.csv") + diff --git a/etl/hubspot_to_invoice.py b/etl/hubspot_to_invoice.py index ddbaeff..a106b4b 100644 --- a/etl/hubspot_to_invoice.py +++ b/etl/hubspot_to_invoice.py @@ -21,7 +21,7 @@ output_path = os.path.abspath(verbose_file) sp.upload_to_sharepoint(output_path, verbose_file) lewis_view = "FOR_LEWIS.xlsx" -selected_columns = ["INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"] +selected_columns = ["HUBSPOT_INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"] minimal_df = df[selected_columns] minimal_df.to_excel(lewis_view, index=False) output_path = os.path.abspath(lewis_view) @@ -58,14 +58,14 @@ sp.move_deals_to_completed(deal_ids) # SCIS # 3 examples of Solar -# ( not in hubspot ) 12 short hedges - Solar 1608 -# ( not in hubspot ) 18 short hedge - Solar 1608 -# ( not in hubspot) 6 forety road -Solar 1608 +# ( in hubspot ) 12 short hedges - Solar 1608 +# ( in hubspot ) 18 short hedge - Solar 1608 +# ( in hubspot) 6 forety road -Solar 1608 # 3 examples Cavity Wall, FOAM, Empty and General ideally -# ( not in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000 -# ( not in hubspot ) 2 queensway, (Fibre) - 500 -# ( not in hubspot )56 Aughton Crescent -(foam) - To be worked out by Lewis but lets use this as an oppurtunity - +# ( in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000 +# ( hubspot ) 2 queensway, (Fibre) - 500 +# ( in hubspot )56 Aughton Crescent 
-(foam) - To be worked out by Lewis but let's use this as an opportunity - # Compare value with what I should get and in the deem score. Keep tabs below so I can check easily diff --git a/etl/filechecker.py b/etl/imagefilenamechcker.py similarity index 95% rename from etl/filechecker.py rename to etl/imagefilenamechcker.py index ce97157..4e32855 100644 --- a/etl/filechecker.py +++ b/etl/imagefilenamechcker.py @@ -69,11 +69,9 @@ for files in list_of_pictures: if 'file' in file: url = file['@microsoft.graph.downloadUrl'] print(f"Downloading {files}/{file['name']}") - sha256 = calculate_sha256(south_coast_scraper.get_file_content(url)) final_list.append({ "Directories": files, "Photo Name": file['name'], - "sha256": sha256, }) final_df = pd.DataFrame(final_list) diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 9474eff..a379894 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -14,7 +14,7 @@ from datetime import datetime, timedelta def previous_monday(): today = datetime.today() last_monday = today - timedelta(days=today.weekday() + 7) # Go back to last week's Monday - return f"W.C. 31.03.2025" + return f"W.C. 31.09.2000" # return f"W.C. 
{last_monday.strftime('%d.%m.%Y')}" WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", previous_monday()) diff --git a/etl/surveyPrice/surveyPrice.py b/etl/surveyPrice/surveyPrice.py index cac0060..8fe1f64 100644 --- a/etl/surveyPrice/surveyPrice.py +++ b/etl/surveyPrice/surveyPrice.py @@ -151,6 +151,7 @@ class SurveyPrice(): def sharepoint_data_for_installer(self, installer): + sp = SharePointScraper(installer) file_paths = sp.download_file_for_each_address() surveys = [] @@ -173,7 +174,7 @@ class SurveyPrice(): "SHAREPOINT FLOOR_AREA_BANDING": "NO PRE SITE NOTES FOUND", "SHAREPOINT PRE_INSTALL_SAP_SCORE": "NO PRE SITE NOTES FOUND", "SHAREPOINT INSULATION MATERIAL": None, - "SHAREPOINT ADDRESS": address + "SHAREPOINT ADDRESS": surveyInfo.address } if surveyInfo.pre_site_note: @@ -231,14 +232,24 @@ class SurveyPrice(): raise RuntimeError("No information found from Hubspot") # Standardise address - self.all_survey_info_from_sharepoint['clean_address'] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply( - lambda x: x.lower().replace(',', '').strip() - ) + def extract_start_and_postcode(addr): + if not isinstance(addr, str) or addr.strip() == "": + return "", "" + parts = addr.lower().replace(",", "").strip().split() + start = ' '.join(parts[:2]) # Number + street + postcode = ' '.join(parts[-2:]) # Postcode + return start, postcode - self.all_hubspot_submissions['clean_address'] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply( - lambda x: x.lower().replace(',', '').strip() + # Extract start + postcode from both datasets + self.all_survey_info_from_sharepoint[['address_start', 'postcode']] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply( + lambda x: pd.Series(extract_start_and_postcode(x)) ) + self.all_hubspot_submissions[['address_start', 'postcode']] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply( + lambda x: pd.Series(extract_start_and_postcode(x)) + ) + + # re-name to installer 
self.all_survey_info_from_sharepoint = self.all_survey_info_from_sharepoint.rename( columns={ @@ -254,14 +265,14 @@ class SurveyPrice(): ) merged_df = pd.merge( - self.all_survey_info_from_sharepoint, - self.all_hubspot_submissions, - left_on=['clean_address'], - right_on=['clean_address'], + self.all_survey_info_from_sharepoint, + self.all_hubspot_submissions, + on=['address_start', 'postcode'], how='inner' ) - merged_df.drop(columns=['clean_address'], inplace=True) + merged_df.drop(columns=['address_start', 'postcode'], inplace=True) + def compute_energy_grant(row): pre_band_letter = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"][-1] @@ -274,12 +285,14 @@ class SurveyPrice(): def work_type(row): if row["ENERGY_GRANT"] == "GBIS": - return row["ENERGY GRANT"] + return "GBIS" else: return f"{row["ENERGY_GRANT"]} - SAP {row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"]} to {row["POST_INSTALL_SAP_SCORE_BANDING"]}" # Add missing variables + if merged_df.size == 0: + raise RuntimeError("no matched addresses with hubspot and sharepoint pre site notes") merged_df["ENERGY_GRANT"] = merged_df.apply(compute_energy_grant, axis=1) merged_df["POST_INSTALL_SAP_SCORE_BANDING"] = merged_df.apply(compute_banding_for_post_sap, axis=1) merged_df["WORK TYPE"] = merged_df.apply(work_type, axis=1)