mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
deeem score improved
This commit is contained in:
parent
aca37ea10d
commit
f9633618b1
5 changed files with 92 additions and 22 deletions
59
etl/age_band_calculator.py
Normal file
59
etl/age_band_calculator.py
Normal file
|
|
@ -0,0 +1,59 @@
|
|||
import os

# Settings that used to be assigned into os.environ here as string literals,
# which committed a live SharePoint client secret to version control. They
# must now be provided by the caller's environment (shell, CI secrets, a
# secrets manager) before this script is run.
# NOTE(review): the previously committed client secret should be rotated.
# NOTE(review): presumably etl.scraper.scraper reads these when it is imported
# below, which is why they are checked before that import - confirm.
REQUIRED_ENV_VARS = (
    "SHAREPOINT_CLIENT_ID",
    "SHAREPOINT_CLIENT_SECRET",
    "SHAREPOINT_TENANT_ID",
    "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID",
    "JJC_SERVICE_SHAREPOINT_ID",
)


def missing_env_vars(environ=None):
    """Return the names in REQUIRED_ENV_VARS that are unset or empty.

    Args:
        environ: Mapping to inspect; defaults to ``os.environ``.

    Returns:
        A list of variable names, in REQUIRED_ENV_VARS order, that are
        missing from ``environ`` or mapped to an empty value.
    """
    env = os.environ if environ is None else environ
    return [name for name in REQUIRED_ENV_VARS if not env.get(name)]


if __name__ == "__main__":
    # Fail fast with a readable message instead of a failure deep inside
    # the scraper when a credential is absent.
    _missing = missing_env_vars()
    if _missing:
        raise SystemExit(
            "Missing required environment variables: " + ", ".join(_missing)
        )
|
||||
from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING
|
||||
import pandas as pd
|
||||
from etl.surveyedData.surveryedData import surveyedDataProcessor
|
||||
|
||||
import etl.scraper.scraper as scraper_module
|
||||
|
||||
def return_pandas_from_scraping(week_commencing, installer):
    """Scrape one installer's surveys for one week and tabulate age bands.

    Args:
        week_commencing: Week label assigned to the scraper module's
            ``WEEK_COMMENCING`` global before scraping (the script passes
            strings such as ``"W.C. 14.04.2025"``).
        installer: Value handed to ``SharePointScraper``.

    Returns:
        A DataFrame with ``address`` and ``age_band`` columns, one row per
        survey that has a pre-site note, or ``None`` when no survey has one.
    """
    # Override the module-level week before the scraper is built.
    # NOTE(review): presumably the scraper reads this global at call time
    # rather than capturing it at import - confirm in etl.scraper.scraper.
    scraper_module.WEEK_COMMENCING = week_commencing
    downloads = SharePointScraper(installer).download_file_for_each_address()

    rows = []
    for files_by_address in downloads:
        for address, files in files_by_address.items():
            survey = surveyedDataProcessor(address, files)
            # Surveys without a pre-site note carry no age band; skip them
            # so they do not contribute empty rows to the result.
            if not survey.pre_site_note:
                continue
            main_property = survey.pre_site_note.property_description.main_property
            rows.append(
                {"address": survey.address, "age_band": main_property.age_band}
            )

    return pd.DataFrame(rows) if rows else None
|
||||
|
||||
installers = [SharePointInstaller.JJC, SharePointInstaller.SOUTH_COAST_INSULATION]

# Week labels to scrape, newest first.
# NOTE(review): "W.C. 07.04.2025" is absent between 14.04 and 31.03 -
# confirm whether that week was skipped on purpose.
dates = [
    "W.C. 14.04.2025",
    "W.C. 31.03.2025",
    "W.C. 24.03.2025",
    "W.C. 17.03.2025",
    "W.C. 10.03.2025",
    "W.C. 03.03.2025",
    "W.C. 24.02.2025",
]

all_dfs = []

for installer in installers:
    for date in dates:
        df = return_pandas_from_scraping(date, installer)
        if df is None:
            # Nothing with a pre-site note for this installer/week.
            continue
        # Tag every row with where it came from before combining.
        df["installer"] = installer.name
        df["week_commencing"] = date
        all_dfs.append(df)

# pd.concat raises "No objects to concatenate" on an empty list; report the
# real cause instead.
if not all_dfs:
    raise RuntimeError("no age band data found for any installer and week")

giant_df = pd.concat(all_dfs, ignore_index=True)
giant_df.to_csv("age_band.csv")
|
||||
|
||||
|
|
@ -21,7 +21,7 @@ output_path = os.path.abspath(verbose_file)
|
|||
sp.upload_to_sharepoint(output_path, verbose_file)
|
||||
|
||||
lewis_view = "FOR_LEWIS.xlsx"
|
||||
selected_columns = ["INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
|
||||
selected_columns = ["HUBSPOT_INSTALLER", "HUBSPOT_DEAL_ADDRESS", "PRICE"]
|
||||
minimal_df = df[selected_columns]
|
||||
minimal_df.to_excel(lewis_view, index=False)
|
||||
output_path = os.path.abspath(lewis_view)
|
||||
|
|
@ -58,14 +58,14 @@ sp.move_deals_to_completed(deal_ids)
|
|||
|
||||
# SCIS
|
||||
# 3 examples of Solar
|
||||
# ( not in hubspot ) 12 short hedges - Solar 1608
|
||||
# ( not in hubspot ) 18 short hedge - Solar 1608
|
||||
# ( not in hubspot) 6 forety road -Solar 1608
|
||||
# ( in hubspot ) 12 short hedges - Solar 1608
|
||||
# ( in hubspot ) 18 short hedge - Solar 1608
|
||||
# ( in hubspot) 6 forety road -Solar 1608
|
||||
|
||||
# 3 examples Cavity Wall, FOAM, Empty and General ideally
|
||||
# ( not in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
|
||||
# ( not in hubspot ) 2 queensway, (Fibre) - 500
|
||||
# ( not in hubspot )56 Aughton Crescent -(foam) - To be worked out by Lewis but let's use this as an opportunity -
|
||||
# ( in hubspot ) 319 Muirfield Road, (Empty Cavity) - 1000
|
||||
# ( hubspot ) 2 queensway, (Fibre) - 500
|
||||
# ( in hubspot )56 Aughton Crescent -(foam) - To be worked out by Lewis but let's use this as an opportunity -
|
||||
|
||||
# Compare value with what I should get and in the deem score. Keep tabs below so I can check easily
|
||||
|
||||
|
|
|
|||
|
|
@ -69,11 +69,9 @@ for files in list_of_pictures:
|
|||
if 'file' in file:
|
||||
url = file['@microsoft.graph.downloadUrl']
|
||||
print(f"Downloading {files}/{file['name']}")
|
||||
sha256 = calculate_sha256(south_coast_scraper.get_file_content(url))
|
||||
final_list.append({
|
||||
"Directories": files,
|
||||
"Photo Name": file['name'],
|
||||
"sha256": sha256,
|
||||
})
|
||||
|
||||
final_df = pd.DataFrame(final_list)
|
||||
|
|
@ -14,7 +14,7 @@ from datetime import datetime, timedelta
|
|||
def previous_monday():
    """Return the week label for last week's Monday.

    The label was previously pinned to a literal date, which left the
    computed Monday unused. To pin a specific week, set the
    ``WEEK_COMMENCING`` environment variable instead; this function only
    supplies the default.

    Returns:
        A string of the form ``"W.C. DD.MM.YYYY"`` whose date is the Monday
        of the week before the current one (local time).
    """
    today = datetime.today()
    # weekday() is 0 on Monday, so subtracting it reaches this week's Monday;
    # the extra 7 days step back to last week's Monday.
    last_monday = today - timedelta(days=today.weekday() + 7)
    return f"W.C. {last_monday.strftime('%d.%m.%Y')}"
|
||||
|
||||
WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", previous_monday())
|
||||
|
|
|
|||
|
|
@ -151,6 +151,7 @@ class SurveyPrice():
|
|||
|
||||
|
||||
def sharepoint_data_for_installer(self, installer):
|
||||
|
||||
sp = SharePointScraper(installer)
|
||||
file_paths = sp.download_file_for_each_address()
|
||||
surveys = []
|
||||
|
|
@ -173,7 +174,7 @@ class SurveyPrice():
|
|||
"SHAREPOINT FLOOR_AREA_BANDING": "NO PRE SITE NOTES FOUND",
|
||||
"SHAREPOINT PRE_INSTALL_SAP_SCORE": "NO PRE SITE NOTES FOUND",
|
||||
"SHAREPOINT INSULATION MATERIAL": None,
|
||||
"SHAREPOINT ADDRESS": address
|
||||
"SHAREPOINT ADDRESS": surveyInfo.address
|
||||
}
|
||||
|
||||
if surveyInfo.pre_site_note:
|
||||
|
|
@ -231,14 +232,24 @@ class SurveyPrice():
|
|||
raise RuntimeError("No information found from Hubspot")
|
||||
|
||||
# Standardise address
|
||||
self.all_survey_info_from_sharepoint['clean_address'] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
|
||||
lambda x: x.lower().replace(',', '').strip()
|
||||
)
|
||||
def extract_start_and_postcode(addr):
    """Split an address into its first two and last two lower-cased words.

    Commas are removed and the text is split on whitespace. Anything that
    is not a string, or contains no words, yields a pair of empty strings.

    Args:
        addr: Raw address value; may be a non-string (e.g. a missing cell).

    Returns:
        ``(start, postcode)``: the leading two words (house number plus the
        first street word in a typical address) and the trailing two words
        (the postcode when it is written with a space). For addresses of
        fewer than four words the two parts overlap.
    """
    if not isinstance(addr, str):
        return "", ""

    words = addr.lower().replace(",", "").split()
    if not words:
        return "", ""

    leading = " ".join(words[:2])
    trailing = " ".join(words[-2:])
    return leading, trailing
|
||||
|
||||
self.all_hubspot_submissions['clean_address'] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
|
||||
lambda x: x.lower().replace(',', '').strip()
|
||||
# Extract start + postcode from both datasets
|
||||
self.all_survey_info_from_sharepoint[['address_start', 'postcode']] = self.all_survey_info_from_sharepoint['SHAREPOINT ADDRESS'].apply(
|
||||
lambda x: pd.Series(extract_start_and_postcode(x))
|
||||
)
|
||||
|
||||
self.all_hubspot_submissions[['address_start', 'postcode']] = self.all_hubspot_submissions['HUBSPOT_DEAL_ADDRESS'].apply(
|
||||
lambda x: pd.Series(extract_start_and_postcode(x))
|
||||
)
|
||||
|
||||
|
||||
# re-name to installer
|
||||
self.all_survey_info_from_sharepoint = self.all_survey_info_from_sharepoint.rename(
|
||||
columns={
|
||||
|
|
@ -254,14 +265,14 @@ class SurveyPrice():
|
|||
)
|
||||
|
||||
merged_df = pd.merge(
|
||||
self.all_survey_info_from_sharepoint,
|
||||
self.all_hubspot_submissions,
|
||||
left_on=['clean_address'],
|
||||
right_on=['clean_address'],
|
||||
self.all_survey_info_from_sharepoint,
|
||||
self.all_hubspot_submissions,
|
||||
on=['address_start', 'postcode'],
|
||||
how='inner'
|
||||
)
|
||||
|
||||
merged_df.drop(columns=['clean_address'], inplace=True)
|
||||
merged_df.drop(columns=['address_start', 'postcode'], inplace=True)
|
||||
|
||||
|
||||
def compute_energy_grant(row):
|
||||
pre_band_letter = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"][-1]
|
||||
|
|
@ -274,12 +285,14 @@ class SurveyPrice():
|
|||
|
||||
def work_type(row):
    """Build the work-type label for one merged survey row.

    Args:
        row: Mapping-like row providing ``ENERGY_GRANT`` and, for grants
            other than GBIS, ``SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING``
            and ``POST_INSTALL_SAP_SCORE_BANDING``.

    Returns:
        ``"GBIS"`` for GBIS grants; otherwise
        ``"<grant> - SAP <pre banding> to <post banding>"``.
    """
    grant = row["ENERGY_GRANT"]
    if grant == "GBIS":
        return "GBIS"

    # Values are pulled into locals so the f-string does not nest the same
    # quote character, which only parses on Python 3.12+.
    pre_band = row["SHAREPOINT PRE_INSTALL_SAP_SCORE_BANDING"]
    post_band = row["POST_INSTALL_SAP_SCORE_BANDING"]
    return f"{grant} - SAP {pre_band} to {post_band}"
|
||||
|
||||
|
||||
# Add missing variables
|
||||
if merged_df.size == 0:
|
||||
raise RuntimeError("no matched addresses with hubspot and sharepoint pre site notes")
|
||||
merged_df["ENERGY_GRANT"] = merged_df.apply(compute_energy_grant, axis=1)
|
||||
merged_df["POST_INSTALL_SAP_SCORE_BANDING"] = merged_df.apply(compute_banding_for_post_sap, axis=1)
|
||||
merged_df["WORK TYPE"] = merged_df.apply(work_type, axis=1)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue