From 77e4f0f1ff92e330e449fb4a0db752339901c750 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 5 Mar 2025 14:00:56 +0000 Subject: [PATCH] added a scraper class to do some calculation outside of script --- etl/README.md | 3 -- etl/src/etl/main.py | 57 +++++++++++++++++++++++---------- etl/src/etl/scraper/__init__.py | 25 +++++++++++++++ etl/src/etl/scraper/scraper.py | 25 +++++++++++++++ 4 files changed, 90 insertions(+), 20 deletions(-) create mode 100644 etl/src/etl/scraper/__init__.py create mode 100644 etl/src/etl/scraper/scraper.py diff --git a/etl/README.md b/etl/README.md index 8b384e7..17edafe 100644 --- a/etl/README.md +++ b/etl/README.md @@ -40,11 +40,8 @@ Currently working on: MVP: Script we can run that will - Go to share point fetch all the data ( in progress ) - provide some form of output - that shows the number of surverys done (Get this information!!!) Flat table diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 6520527..83ee442 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -4,18 +4,11 @@ from pdfReader.pdfReaderToText import pdfReaderToText from utils.sharepoint.sharepoint import SharePointClient from pprint import pprint, pformat import logging +import tempfile +from utils.logger import Logger DATA_LOC = "/workspaces/survey-extraction/data/" INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" - -SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) -SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) -SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) -SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None) -week_commencing_folder_name = "WC 24-02-2025" - -from utils.logger import Logger - logger = Logger(name="main.py", level=logging.DEBUG).get_logger() @@ -27,6 +20,8 @@ def main(): #list_ = 
pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() #pprint(list_) + # POC Scraper from sharepoint and get useful data + sharepoint_client = SharePointClient( tenant_id=SHAREPOINT_TENANT_ID, client_id=SHAREPOINT_CLIENT_ID, @@ -35,16 +30,44 @@ def main(): ) installer_folders = sharepoint_client.list_folder_contents("/") - #logger.info(pformat(installer_folders)) - skipped = 0 - for installer_folder in installer_folders["value"]: - if installer_folder["name"] == "Mark Billingham": - logger.info(pformat(installer_folder)) - + surverys_work_completed = {} + first_layer_folder_structure = {} + + # Get each name + for survey_folder in installer_folders['value']: + if survey_folder["name"] not in surverys_work_completed: + surverys_work_completed.update({survey_folder["name"]: 0}) + + for name, value in surverys_work_completed.items(): + housing_assosciation = sharepoint_client.list_folder_contents(f"/{name}") + + if 'value' in housing_assosciation: + if len(housing_assosciation['value']) > 0: + first_layer_folder_structure.update({name: housing_assosciation['value'][0]['name']}) + else: + logger.info(f"Skipping 1 {name}") else: - skipped += 1 - logger.info(f"skipped {skipped}") + logger.info(f"Skipping 2 {name}") + + logger.info(pformat(first_layer_folder_structure)) + + + # logger.info(pformat(surverys_work_completed)) + + + + + + + +def create_temp_file(dict_content): + with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file: + formatted_content = pformat(dict_content) + temp_file.write(formatted_content + "\n") + temp_file_path = temp_file.name + + logger.info(f"Temporary file created at: {temp_file_path}") if __name__ == "__main__": diff --git a/etl/src/etl/scraper/__init__.py b/etl/src/etl/scraper/__init__.py new file mode 100644 index 0000000..008d446 --- /dev/null +++ b/etl/src/etl/scraper/__init__.py @@ -0,0 +1,25 @@ +from enum import Enum +import os + +# Awesome Khalim's Sharepoint code that I'll probably will need to learn how 
it works in the future +from utils.sharepoint.sharepoint import SharePointClient + +class SharePointInstaller(Enum): + SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None) + +class SharePointScraper(): + """ + A simple scraper to get the contents of a sharepoint and validate inputs so I can manually change + """ + + def __init__(self, sharepoint_location): + self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None) + self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None) + self.sharepoint_drive_id = os.getenv(sharepoint_location.value) + + def get_folders_in_path("/") + + + + diff --git a/etl/src/etl/scraper/scraper.py b/etl/src/etl/scraper/scraper.py new file mode 100644 index 0000000..008d446 --- /dev/null +++ b/etl/src/etl/scraper/scraper.py @@ -0,0 +1,25 @@ +from enum import Enum +import os + +# Awesome Khalim's Sharepoint code that I'll probably need to learn how it works in the future +from utils.sharepoint.sharepoint import SharePointClient + +class SharePointInstaller(Enum): + SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None) + +class SharePointScraper(): + """ + A simple scraper to get the contents of a sharepoint and validate inputs so I can manually change + """ + + def __init__(self, sharepoint_location): + self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None) + self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None) + self.sharepoint_drive_id = os.getenv(sharepoint_location.value) + + def get_folders_in_path("/") + + + +