Added a scraper class to do some calculations outside of the script

This commit is contained in:
Jun-te Kim 2025-03-05 14:00:56 +00:00
parent 8751dfd9a1
commit 77e4f0f1ff
4 changed files with 90 additions and 20 deletions

View file

@ -40,11 +40,8 @@ Currently working on:
MVP:
Script we can run that will
Go to SharePoint and fetch all the data (in progress)
provide some form of output
that shows the number of surveys done (get this information!)
Flat table

View file

@ -4,18 +4,11 @@ from pdfReader.pdfReaderToText import pdfReaderToText
from utils.sharepoint.sharepoint import SharePointClient
from pprint import pprint, pformat
import logging
import tempfile
from utils.logger import Logger
DATA_LOC = "/workspaces/survey-extraction/data/"
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)
week_commencing_folder_name = "WC 24-02-2025"
from utils.logger import Logger
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
@ -27,6 +20,8 @@ def main():
#list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
#pprint(list_)
# POC Scraper from sharepoint and get useful data
sharepoint_client = SharePointClient(
tenant_id=SHAREPOINT_TENANT_ID,
client_id=SHAREPOINT_CLIENT_ID,
@ -35,16 +30,44 @@ def main():
)
installer_folders = sharepoint_client.list_folder_contents("/")
#logger.info(pformat(installer_folders))
skipped = 0
for installer_folder in installer_folders["value"]:
if installer_folder["name"] == "Mark Billingham":
logger.info(pformat(installer_folder))
surverys_work_completed = {}
first_layer_folder_structure = {}
# Get each name
for survey_folder in installer_folders['value']:
if survey_folder["name"] not in surverys_work_completed:
surverys_work_completed.update({survey_folder["name"]: 0})
for name, value in surverys_work_completed.items():
housing_assosciation = sharepoint_client.list_folder_contents(f"/{name}")
if 'value' in housing_assosciation:
if len(housing_assosciation['value']) > 0:
first_layer_folder_structure.update({name: housing_assosciation['value'][0]['name']})
else:
logger.info(f"Skipping 1 {name}")
else:
skipped += 1
logger.info(f"skipped {skipped}")
logger.info(f"Skipping 2 {name}")
logger.info(pformat(first_layer_folder_structure))
# logger.info(pformat(surverys_work_completed))
def create_temp_file(dict_content):
    """
    Write a pretty-printed copy of ``dict_content`` to a new temporary ``.txt`` file.

    The file is opened with ``delete=False``, so it is NOT removed when this
    function returns; the caller is responsible for deleting it when done.

    Args:
        dict_content: Object to format with ``pprint.pformat`` and write out.

    Returns:
        str: Path of the temporary file that was created.
    """
    # Explicit UTF-8 so non-ASCII folder names do not depend on the platform's
    # default encoding.
    with tempfile.NamedTemporaryFile(
        mode='w+', delete=False, suffix='.txt', encoding='utf-8'
    ) as temp_file:
        temp_file.write(pformat(dict_content) + "\n")
        temp_file_path = temp_file.name

    logger.info(f"Temporary file created at: {temp_file_path}")
    # Return the path so callers can find the file without parsing the log.
    return temp_file_path
if __name__ == "__main__":

View file

@ -0,0 +1,25 @@
from enum import Enum
import os
# Khalim's awesome SharePoint code; I will probably need to learn how it works in the future
from utils.sharepoint.sharepoint import SharePointClient
class SharePointInstaller(Enum):
    """
    Installers whose SharePoint content can be scraped.

    Each member's value is read from the environment when this module is
    imported, and is None when the variable is not set.
    """

    # Presumably the SharePoint drive id for this installer -- confirm against SharePointClient.
    SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.environ.get(
        "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"
    )
class SharePointScraper():
    """
    A simple scraper to get the contents of a SharePoint and validate inputs so they can be changed manually.

    Credentials are read from the SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET
    and SHAREPOINT_TENANT_ID environment variables; each is None when unset.
    """

    def __init__(self, sharepoint_location):
        """
        Args:
            sharepoint_location: Enum member (e.g. a ``SharePointInstaller``
                member) whose ``value`` holds the id of the location to scrape.
        """
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        # The enum member's value is already the resolved id (the enum calls
        # os.getenv itself), so use it directly. Looking it up in the
        # environment a second time would return None, or raise TypeError
        # when the value is None.
        self.sharepoint_drive_id = sharepoint_location.value

    def get_folders_in_path(self, path="/"):
        """
        List the folders found under ``path`` (defaults to the root, "/").

        Raises:
            NotImplementedError: Always; the lookup has not been written yet.
        """
        # TODO: presumably this should delegate to
        # SharePointClient.list_folder_contents(path) -- confirm the client's
        # constructor arguments before wiring it up.
        raise NotImplementedError("get_folders_in_path is not implemented yet")

View file

@ -0,0 +1,25 @@
from enum import Enum
import os
# Khalim's awesome SharePoint code; I will probably need to learn how it works in the future
from utils.sharepoint.sharepoint import SharePointClient
class SharePointInstaller(Enum):
    """
    Installers whose SharePoint content can be scraped.

    Each member's value is read from the environment when this module is
    imported, and is None when the variable is not set.
    """

    # Presumably the SharePoint drive id for this installer -- confirm against SharePointClient.
    SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.environ.get(
        "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"
    )
class SharePointScraper():
    """
    A simple scraper to get the contents of a SharePoint and validate inputs so they can be changed manually.

    Credentials are read from the SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET
    and SHAREPOINT_TENANT_ID environment variables; each is None when unset.
    """

    def __init__(self, sharepoint_location):
        """
        Args:
            sharepoint_location: Enum member (e.g. a ``SharePointInstaller``
                member) whose ``value`` holds the id of the location to scrape.
        """
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        # The enum member's value is already the resolved id (the enum calls
        # os.getenv itself), so use it directly. Looking it up in the
        # environment a second time would return None, or raise TypeError
        # when the value is None.
        self.sharepoint_drive_id = sharepoint_location.value

    def get_folders_in_path(self, path="/"):
        """
        List the folders found under ``path`` (defaults to the root, "/").

        Raises:
            NotImplementedError: Always; the lookup has not been written yet.
        """
        # TODO: presumably this should delegate to
        # SharePointClient.list_folder_contents(path) -- confirm the client's
        # constructor arguments before wiring it up.
        raise NotImplementedError("get_folders_in_path is not implemented yet")