mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
added a scraper class to do some calculation outside of script
This commit is contained in:
parent
8751dfd9a1
commit
77e4f0f1ff
4 changed files with 90 additions and 20 deletions
|
|
@ -40,11 +40,8 @@ Currently working on:
|
|||
|
||||
MVP:
|
||||
Script we can run that will
|
||||
|
||||
Go to SharePoint and fetch all the data (in progress)
|
||||
|
||||
provide some form of output
|
||||
|
||||
that shows the number of surveys done (Get this information!!!)
|
||||
|
||||
Flat table
|
||||
|
|
|
|||
|
|
@ -4,18 +4,11 @@ from pdfReader.pdfReaderToText import pdfReaderToText
|
|||
from utils.sharepoint.sharepoint import SharePointClient
|
||||
from pprint import pprint, pformat
|
||||
import logging
|
||||
import tempfile
|
||||
from utils.logger import Logger
|
||||
|
||||
DATA_LOC = "/workspaces/survey-extraction/data/"
|
||||
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
|
||||
|
||||
SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
|
||||
SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
|
||||
SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
|
||||
SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)
|
||||
week_commencing_folder_name = "WC 24-02-2025"
|
||||
|
||||
from utils.logger import Logger
|
||||
|
||||
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
||||
|
||||
|
||||
|
|
@ -27,6 +20,8 @@ def main():
|
|||
#list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
|
||||
#pprint(list_)
|
||||
|
||||
# POC Scraper from sharepoint and get useful data
|
||||
|
||||
sharepoint_client = SharePointClient(
|
||||
tenant_id=SHAREPOINT_TENANT_ID,
|
||||
client_id=SHAREPOINT_CLIENT_ID,
|
||||
|
|
@ -35,16 +30,44 @@ def main():
|
|||
)
|
||||
|
||||
installer_folders = sharepoint_client.list_folder_contents("/")
|
||||
#logger.info(pformat(installer_folders))
|
||||
|
||||
skipped = 0
|
||||
for installer_folder in installer_folders["value"]:
|
||||
if installer_folder["name"] == "Mark Billingham":
|
||||
logger.info(pformat(installer_folder))
|
||||
|
||||
surverys_work_completed = {}
|
||||
first_layer_folder_structure = {}
|
||||
|
||||
# Get each name
|
||||
for survey_folder in installer_folders['value']:
|
||||
if survey_folder["name"] not in surverys_work_completed:
|
||||
surverys_work_completed.update({survey_folder["name"]: 0})
|
||||
|
||||
for name, value in surverys_work_completed.items():
|
||||
housing_assosciation = sharepoint_client.list_folder_contents(f"/{name}")
|
||||
|
||||
if 'value' in housing_assosciation:
|
||||
if len(housing_assosciation['value']) > 0:
|
||||
first_layer_folder_structure.update({name: housing_assosciation['value'][0]['name']})
|
||||
else:
|
||||
logger.info(f"Skipping 1 {name}")
|
||||
else:
|
||||
skipped += 1
|
||||
logger.info(f"skipped {skipped}")
|
||||
logger.info(f"Skipping 2 {name}")
|
||||
|
||||
logger.info(pformat(first_layer_folder_structure))
|
||||
|
||||
|
||||
# logger.info(pformat(surverys_work_completed))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def create_temp_file(dict_content):
    """Write a pretty-printed dump of ``dict_content`` to a temporary file.

    The file is created with ``delete=False``, so it outlives this call and
    must be removed by the caller when no longer needed.

    Args:
        dict_content: Any object ``pprint.pformat`` can render (typically a
            dict returned by the SharePoint client).

    Returns:
        The filesystem path of the ``.txt`` file that was written.
    """
    formatted_content = pformat(dict_content)

    # Explicit UTF-8 so non-ASCII folder names do not depend on the locale.
    with tempfile.NamedTemporaryFile(
        mode="w+", delete=False, suffix=".txt", encoding="utf-8"
    ) as temp_file:
        temp_file.write(formatted_content + "\n")
        temp_file_path = temp_file.name

    logger.info(f"Temporary file created at: {temp_file_path}")
    # Return the path so the caller can open or delete the file later;
    # previously it was only visible in the log output.
    return temp_file_path
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
25
etl/src/etl/scraper/__init__.py
Normal file
25
etl/src/etl/scraper/__init__.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
from enum import Enum
|
||||
import os
|
||||
|
||||
# Khalim's SharePoint client code; I will probably need to learn how it works in the future
|
||||
from utils.sharepoint.sharepoint import SharePointClient
|
||||
|
||||
class SharePointInstaller(Enum):
    """Installer SharePoint locations.

    Each member's value is read from the environment variable of the same
    name when this module is imported; it is ``None`` if that variable is
    not set.
    """

    SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)


class SharePointScraper():
    """
    A simple scraper to get the contents of a SharePoint and validate inputs
    so they can be changed manually.
    """

    def __init__(self, sharepoint_location):
        """Collect the SharePoint credentials and the target drive id.

        Args:
            sharepoint_location: An enum member (e.g. a
                ``SharePointInstaller`` member) whose ``value`` is the
                drive id to scrape.
        """
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        # The enum value already holds the id read from the environment.
        # Passing it to os.getenv() again would look up a variable *named*
        # after the id (yielding None) or raise TypeError when it is None.
        self.sharepoint_drive_id = sharepoint_location.value

    def get_folders_in_path(self, path="/"):
        """List the folders found under ``path`` (defaults to the root).

        Not implemented yet: the original definition was an unfinished stub
        that did not parse, which made the whole module unimportable.

        Raises:
            NotImplementedError: Always, until the listing logic is written.
        """
        # TODO: build the SharePoint client from the credentials above and
        # return the folder listing for ``path``.
        raise NotImplementedError("get_folders_in_path is not implemented yet")
|
||||
|
||||
|
||||
|
||||
|
||||
25
etl/src/etl/scraper/scraper.py
Normal file
25
etl/src/etl/scraper/scraper.py
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
from enum import Enum
|
||||
import os
|
||||
|
||||
# Khalim's SharePoint client code; I will probably need to learn how it works in the future
|
||||
from utils.sharepoint.sharepoint import SharePointClient
|
||||
|
||||
class SharePointInstaller(Enum):
    """Installer SharePoint locations.

    Each member's value is read from the environment variable of the same
    name when this module is imported; it is ``None`` if that variable is
    not set.
    """

    SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)


class SharePointScraper():
    """
    A simple scraper to get the contents of a SharePoint and validate inputs
    so they can be changed manually.
    """

    def __init__(self, sharepoint_location):
        """Collect the SharePoint credentials and the target drive id.

        Args:
            sharepoint_location: An enum member (e.g. a
                ``SharePointInstaller`` member) whose ``value`` is the
                drive id to scrape.
        """
        self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
        self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
        self.sharepoint_tenant_id = os.getenv("SHAREPOINT_TENANT_ID", None)
        # The enum value already holds the id read from the environment.
        # Passing it to os.getenv() again would look up a variable *named*
        # after the id (yielding None) or raise TypeError when it is None.
        self.sharepoint_drive_id = sharepoint_location.value

    def get_folders_in_path(self, path="/"):
        """List the folders found under ``path`` (defaults to the root).

        Not implemented yet: the original definition was an unfinished stub
        that did not parse, which made the whole module unimportable.

        Raises:
            NotImplementedError: Always, until the listing logic is written.
        """
        # TODO: build the SharePoint client from the credentials above and
        # return the folder listing for ``path``.
        raise NotImplementedError("get_folders_in_path is not implemented yet")
|
||||
|
||||
|
||||
|
||||
|
||||
Loading…
Add table
Reference in a new issue