changes made

This commit is contained in:
Jun-te Kim 2025-04-10 10:18:55 +00:00
parent a2c4cfedbe
commit 7b07edf8c4
4 changed files with 167 additions and 86 deletions

80
etl/filechecker.py Normal file
View file

@ -0,0 +1,80 @@
import os

# SharePoint application / site identifiers. These are set BEFORE the scraper
# import below, exactly as in the original ordering -- presumably
# etl.scraper.scraper reads them at import time (TODO confirm).
os.environ["SHAREPOINT_CLIENT_ID"] = "895e3b77-b1d7-43ec-b18f-dcfe07cdfeaf"
os.environ["SHAREPOINT_TENANT_ID"] = "c3f7519c-2719-4547-af04-6da6cbfd8f8f"
os.environ["SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"] = "b5a51507-9427-4ee0-b03e-90ec7681e2d3"
os.environ["JJC_SERVICE_SHAREPOINT_ID"] = "7fdd0485-bbf3-4b29-b30f-98c81c2a6284"

# SECURITY: the client secret is a credential and must not live in source
# control. It has to be supplied by the runtime environment (shell export,
# .env loader, CI secret store, ...). The value that used to be committed on
# this line should be treated as compromised and rotated.
if not os.environ.get("SHAREPOINT_CLIENT_SECRET"):
    raise RuntimeError(
        "SHAREPOINT_CLIENT_SECRET is not set; export it in the environment "
        "before running this script."
    )

from etl.scraper.scraper import SharePointScraper, SharePointInstaller, WEEK_COMMENCING  # noqa: E402
import pandas as pd  # noqa: E402
import hashlib  # noqa: E402
def calculate_sha256(bytes_io):
    """Return the hex-encoded SHA-256 digest of a seekable binary stream.

    The stream is rewound to its start before hashing, so the digest always
    covers the full content regardless of the current read position. After
    the call the stream is positioned at its end.

    Args:
        bytes_io: A seekable binary file-like object (e.g. ``io.BytesIO``).

    Returns:
        The SHA-256 digest as a 64-character lowercase hexadecimal string.
    """
    bytes_io.seek(0)  # Hash from the beginning, whatever was read before.
    digest = hashlib.sha256()
    # Feed the hash in fixed-size chunks so a large file is not duplicated
    # in memory by a single read() call; the resulting digest is identical.
    for chunk in iter(lambda: bytes_io.read(65536), b""):
        digest.update(chunk)
    return digest.hexdigest()
# Top-level folders whose name contains any of these markers are ignored.
_EXCLUDED_ROOT_MARKERS = ("Khalim", ".Training")

# Column order of the CSV manifest; also guarantees a header row when no
# photos were found at all.
_MANIFEST_COLUMNS = ["Directories", "Photo Name", "sha256"]


def _child_folders(scraper, parent_path, skip_markers=()):
    """Return the full paths of the sub-folders directly under *parent_path*.

    An entry of the listing counts as a folder when it has no ``'file'`` key.
    Entries whose name contains any string in *skip_markers* are dropped.
    """
    listing = scraper.get_folders_in_path(parent_path)
    # The root is addressed as "/", so children of the root are "/<name>"
    # rather than "//<name>".
    base = "" if parent_path == "/" else parent_path
    return [
        f"{base}/{entry['name']}"
        for entry in listing["value"]
        if "file" not in entry
        and not any(marker in entry["name"] for marker in skip_markers)
    ]


def _descend(scraper, parent_paths):
    """Return the sub-folders one level below every path in *parent_paths*."""
    return [
        child
        for parent in parent_paths
        for child in _child_folders(scraper, parent)
    ]


def _hash_photos(scraper, photo_folders):
    """Download every file directly inside *photo_folders* and hash it.

    Returns one dict per file with the containing folder path, the file name
    and the SHA-256 hex digest of its content.
    """
    rows = []
    for folder in photo_folders:
        listing = scraper.get_folders_in_path(folder)
        for entry in listing["value"]:
            if "file" not in entry:
                continue  # Nested folders at this depth are not traversed.
            content = scraper.get_file_content(entry["@microsoft.graph.downloadUrl"])
            rows.append({
                "Directories": folder,
                "Photo Name": entry["name"],
                "sha256": calculate_sha256(content),
            })
    return rows


# The scraper targets the JJC installer site (the original variable was
# misleadingly called ``south_coast_scraper``).
jjc_scraper = SharePointScraper(SharePointInstaller.JJC)

# Walk the folder tree one level at a time. The level names below follow the
# original script's variable names; the actual SharePoint layout is assumed
# to be root / <folder> / <date> / <housing association> / <address> /
# <pictures> -- TODO confirm against the site.
top_level_folders = _child_folders(jjc_scraper, "/", skip_markers=_EXCLUDED_ROOT_MARKERS)
date_folders = _descend(jjc_scraper, top_level_folders)
print(date_folders)
housing_association_folders = _descend(jjc_scraper, date_folders)
address_folders = _descend(jjc_scraper, housing_association_folders)
picture_folders = _descend(jjc_scraper, address_folders)
print(picture_folders)

# One row per photo: where it lives, what it is called and its content hash.
final_df = pd.DataFrame(_hash_photos(jjc_scraper, picture_folders), columns=_MANIFEST_COLUMNS)
final_df.to_csv("jjc.csv")

View file

@ -21,7 +21,6 @@ class HubSpotClient():
found_deals = []
after = None
while True:
print("hello world")
search_request = PublicObjectSearchRequest(
filter_groups=[{
"filters": [{
@ -36,7 +35,8 @@ class HubSpotClient():
"work_type",
"property_needs_trickle_vents",
"domna_survey_post_sap",
"existing_wall_insulation"
"existing_wall_insulation",
"installer",
],
limit=200,
after=after,
@ -46,7 +46,6 @@ class HubSpotClient():
if not response.paging or not response.paging.next:
break
after = response.paging.next.after
return found_deals
all_deals = []
if hasattr(found_deals, "results"):
@ -58,7 +57,8 @@ class HubSpotClient():
needs_trickle_ventilation=True if deal.properties["property_needs_trickle_vents"].upper() == "YES" else False,
post_sap_score=int(deal.properties["domna_survey_post_sap"]),
existing_wall_insulation=deal.properties["existing_wall_insulation"],
no_of_wet_rooms=int(deal.properties["number_of_wet_rooms_needing_ventilation"])
no_of_wet_rooms=int(deal.properties["number_of_wet_rooms_needing_ventilation"]),
installer=deal.properties["Installer"],
))
return all_deals
else:
@ -70,4 +70,4 @@ class HubSpotClient():
print(f"Pipeline: {pipeline.label}")
for stage in pipeline.stages:
print(f" - Label: {stage.label}")
print(f" ID: {stage.id}") #
print(f" ID: {stage.id}") #

View file

@ -17,4 +17,5 @@ class SubmissionInfoFromDeal(BaseModel):
needs_trickle_ventilation: bool
post_sap_score: int
existing_wall_insulation: str
no_of_wet_rooms: int
no_of_wet_rooms: int
installer: str

View file

@ -13,7 +13,7 @@ os.environ["JJC_SERVICE_SHAREPOINT_ID"] = "7fdd0485-bbf3-4b29-b30f-98c81c2a6284"
hubSpotClient = HubSpotClient()
deals = hubSpotClient.get_deals_from_deal_stage(DealStage.CUSTOMER_CONTACTED)
print(deals)
hubSpotClient.print_all_pipeline_ids()
csv_list = []
@ -26,99 +26,99 @@ for deal in deals:
"wetrooms": deal.no_of_wet_rooms,
"hubspot_wall_insulation": deal.existing_wall_insulation,
"POST INSTALL SAP SCORE": deal.post_sap_score,
"Installer": deal.installer,
})
hubspot_submissions = pd.DataFrame(csv_list)
price_empty = get_jjc_price_matrix()
price_foam = get_jjc_price_matrix("foam.csv")
price_general = get_jjc_price_matrix("general.csv")
total_price = []
# hubspot_submissions = pd.DataFrame(csv_list)
# price_empty = get_jjc_price_matrix()
# price_foam = get_jjc_price_matrix("foam.csv")
# price_general = get_jjc_price_matrix("general.csv")
# total_price = []
jjc = SharePointScraper(SharePointInstaller.JJC, development=True)
file_paths = jjc.download_file_for_each_address()
list_of_surveys = []
# jjc = SharePointScraper(SharePointInstaller.JJC, development=True)
# file_paths = jjc.download_file_for_each_address()
# list_of_surveys = []
for eachAddress in file_paths:
for address, files in eachAddress.items():
list_of_surveys.append(surveyedDataProcessor(address, files))
for survey in list_of_surveys:
if survey.pre_site_note:
floor_banding, total_floor_area = work_out_total_floor_area(survey.pre_site_note)
letter, number = survey.pre_site_note.survey_information.current_sap.split(" ")
pre_sap_score = number+letter
# for eachAddress in file_paths:
# for address, files in eachAddress.items():
# list_of_surveys.append(surveyedDataProcessor(address, files))
# for survey in list_of_surveys:
# if survey.pre_site_note:
# floor_banding, total_floor_area = work_out_total_floor_area(survey.pre_site_note)
# letter, number = survey.pre_site_note.survey_information.current_sap.split(" ")
# pre_sap_score = number+letter
address = survey.pre_site_note.survey_information.address.split(",")
address = [item.strip() for item in address][0]
filtered_df = hubspot_submissions[hubspot_submissions["Address"].apply(lambda x: address.upper() == x.split(",")[0].upper())]
if len(filtered_df) == 1:
funding_type = type_of_work(letter.upper(), get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])[-1])
data = {
"Address": survey.pre_site_note.survey_information.address,
"HubSpot Address": filtered_df["Address"].values[0],
"Pre SAP from sharepoint": number,
"Post SAP from surveyor": filtered_df["POST INSTALL SAP SCORE"].values[0],
"Surveyor's Name": survey.pre_site_note.assessor_information.name,
"floor_area_group" : floor_banding,
"wetrooms" : filtered_df["wetrooms"].values[0],
"Trickle Vent" : filtered_df["Trickle Vent"].values[0],
"survey_stated_work_type": filtered_df["hubspot_work_type"].values[0],
}
# address = survey.pre_site_note.survey_information.address.split(",")
# address = [item.strip() for item in address][0]
# filtered_df = hubspot_submissions[hubspot_submissions["Address"].apply(lambda x: address.upper() == x.split(",")[0].upper())]
# if len(filtered_df) == 1:
# funding_type = type_of_work(letter.upper(), get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])[-1])
# data = {
# "Address": survey.pre_site_note.survey_information.address,
# "HubSpot Address": filtered_df["Address"].values[0],
# "Pre SAP from sharepoint": number,
# "Post SAP from surveyor": filtered_df["POST INSTALL SAP SCORE"].values[0],
# "Surveyor's Name": survey.pre_site_note.assessor_information.name,
# "floor_area_group" : floor_banding,
# "wetrooms" : filtered_df["wetrooms"].values[0],
# "Trickle Vent" : filtered_df["Trickle Vent"].values[0],
# "survey_stated_work_type": filtered_df["hubspot_work_type"].values[0],
# }
csr_insulation = None
merged_df = pd.DataFrame()
if survey.csr:
if survey.csr.insulation_info:
csr_insulation = survey.csr.insulation_info.type.upper()
# csr_insulation = None
# merged_df = pd.DataFrame()
# if survey.csr:
# if survey.csr.insulation_info:
# csr_insulation = survey.csr.insulation_info.type.upper()
hubspot_wall_insulation = None
hubspot_wall_insulation = filtered_df["hubspot_wall_insulation"].values[0]
data.update({"csr_insulation": csr_insulation})
data.update({"hubspot_wall_insulation": hubspot_wall_insulation})
# hubspot_wall_insulation = None
# hubspot_wall_insulation = filtered_df["hubspot_wall_insulation"].values[0]
# data.update({"csr_insulation": csr_insulation})
# data.update({"hubspot_wall_insulation": hubspot_wall_insulation})
if funding_type == "GBIS":
if csr_insulation is None and hubspot_wall_insulation.upper() == "EMPTY":
data.update({"funding": funding_type.upper()})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_empty, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
elif "FOAM" in csr_insulation.upper() and "FOAM" in hubspot_wall_insulation.upper():
data.update({"funding": funding_type.upper() + " Remedial"})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_foam, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
else:
data.update({"funding": funding_type.upper() + " Remedial"})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_general, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
elif funding_type == "ECO4":
if csr_insulation is None and hubspot_wall_insulation.upper() == "EMPTY":
formatted_funding_type = f"{funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
data.update({"funding": formatted_funding_type})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_empty, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
elif "FOAM" in csr_insulation.upper() and "FOAM" in hubspot_wall_insulation.upper():
formatted_funding_type = f"REMEDIAL - {funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
data.update({"funding": formatted_funding_type})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_foam, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
else:
formatted_funding_type = f"REMEDIAL - {funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
data.update({"funding": formatted_funding_type})
df = pd.DataFrame([data])
merged_df = pd.merge(df, price_general, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
else:
raise RuntimeError(f"UNKNOWN FUNDING TYPE {funding_type}")
# if funding_type == "GBIS":
# if csr_insulation is None and hubspot_wall_insulation.upper() == "EMPTY":
# data.update({"funding": funding_type.upper()})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_empty, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# elif "FOAM" in csr_insulation.upper() and "FOAM" in hubspot_wall_insulation.upper():
# data.update({"funding": funding_type.upper() + " Remedial"})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_foam, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# else:
# data.update({"funding": funding_type.upper() + " Remedial"})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_general, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# elif funding_type == "ECO4":
# if csr_insulation is None and hubspot_wall_insulation.upper() == "EMPTY":
# formatted_funding_type = f"{funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
# data.update({"funding": formatted_funding_type})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_empty, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# elif "FOAM" in csr_insulation.upper() and "FOAM" in hubspot_wall_insulation.upper():
# formatted_funding_type = f"REMEDIAL - {funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
# data.update({"funding": formatted_funding_type})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_foam, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# else:
# formatted_funding_type = f"REMEDIAL - {funding_type.upper()} - SAP {get_band(int(number))} to {get_band(filtered_df["POST INSTALL SAP SCORE"].values[0])}"
# data.update({"funding": formatted_funding_type})
# df = pd.DataFrame([data])
# merged_df = pd.merge(df, price_general, on=['funding', 'Trickle Vent', 'floor_area_group', 'wetrooms'], how='left')
# else:
# raise RuntimeError(f"UNKNOWN FUNDING TYPE {funding_type}")
if not merged_df.empty:
total_price.append(merged_df)
# if not merged_df.empty:
# total_price.append(merged_df)
final_df = pd.concat(total_price, ignore_index=True)
# final_df = pd.concat(total_price, ignore_index=True)
final_df.to_csv("survery_data.csv", index=False)
# final_df.to_csv("survery_data.csv", index=False)
print(f"WEEK COMMENCING {WEEK_COMMENCING}")
print("Excel file 'survey_data.xlsx' created successfully!")
# print(f"WEEK COMMENCING {WEEK_COMMENCING}")
# print("Excel file 'survey_data.xlsx' created successfully!")