scraper goes to each folder and finds the file path

This commit is contained in:
Jun-te Kim 2025-03-10 16:03:51 +00:00
parent 392acf5a73
commit 164f3ba00a
3 changed files with 52 additions and 13 deletions

View file

@ -13,30 +13,32 @@ INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission
# Module-level logger for this script, created under the name "main.py" at DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
def main():
# Entry point: runs whichever proof-of-concept (POC) snippets below are left uncommented.
# POC PDF Reader
# list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
# pprint(list_)
# POC Scraper -> This part of the code gets the names that have the wrong date format
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
logger.info(pformat(list_of_names))
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
# south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
# list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
# logger.info(pformat(list_of_names))
# POC Scraper -> This part of the code gets every variation of housing association names
# # POC Scraper -> This part of the code gets every variation of housing association names
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
# list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
# logger.info(pformat(list_of_house_ass_names))
# POC of work completed
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
# surverys_completed = south_coast_scraper.get_number_of_surverys_completed()
# logger.info(pformat(surverys_completed))
# POC Scraper -> Walk the address folders for the current week and list the files found in each
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
south_coast_scraper.download_file_for_each_address()
# logger.info(south_coast_scraper.surveyor_to_housing_assosications)
def print_hello():
    """Print a fixed greeting line to standard output."""
    greeting = "hello man"
    print(greeting)
def create_temp_file(dict_content):
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
@ -52,6 +54,7 @@ if __name__ == "__main__":
# Get and read files
# and make a quick verification it was two layers
# CSV file of the survey number

View file

@ -9,10 +9,11 @@ from functools import wraps
import re
from etl.validator.validator import DomnaSharePointValidator
# Name of the weekly folder to process; read from the WEEK_COMMENCING environment
# variable, falling back to the literal default when the variable is unset.
# NOTE(review): the name is assigned twice in a row, so only the second default
# ("W.C. 03.03.2025") ever takes effect -- the first looks like a pre-change
# leftover; confirm and drop it.
WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 24.02.2025")
WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 03.03.2025")
class SharePointInstaller(Enum):
    """Installers whose SharePoint identifier is supplied through the environment.

    Each member's value is whatever the matching ``*_SHAREPOINT_ID`` environment
    variable holds at import time, or ``None`` when the variable is unset.

    NOTE: Enum members with equal values are aliases. If both variables are
    unset (both ``None``) or hold the same string, ``JJC_SERVICE`` becomes an
    alias of ``SOUTH_COAST_INSULATION_SERVICE`` rather than a distinct member.
    """

    SOUTH_COAST_INSULATION_SERVICE = os.environ.get(
        "SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID"
    )
    JJC_SERVICE = os.environ.get("JJC_SERVICE_SHAREPOINT_ID")
class SharePointScraper():
"""
@ -41,6 +42,12 @@ class SharePointScraper():
self.surveyor_work_completed = {}
# Delete me for production
self.surveyor_names = ['Abdul Koddus']
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
def ensure_surveyor_names_loaded(func):
@ -144,7 +151,6 @@ class SharePointScraper():
list_of_housing_association.append(house_ass_name)
if house_ass_name not in self.house_association_names:
self.house_association_names.append(house_ass_name)
self.surveyor_to_housing_assosications.update({name: list_of_housing_association})
else:
self.logger.warning(f"Failed to get housing association folder for {name}, {self.surveyor_to_dates_folder[name]}")
@ -178,4 +184,33 @@ class SharePointScraper():
@ensure_housing_assosiation_is_loaded
def download_file_for_each_address(self):
    """List the non-media files in every address folder for the current week.

    For each surveyor that has a ``WEEK_COMMENCING`` folder, walks
    ``/<surveyor>/<WEEK_COMMENCING>/<housing association>/<address>`` and logs
    the names of the files found in each address folder, excluding
    ``.jpg`` and ``.mov`` files (matched case-insensitively).

    NOTE(review): despite the name, nothing is downloaded yet -- the file
    names are only logged. Returns ``None``.

    Raises:
        RuntimeError: if a folder listing comes back without a ``'value'`` key.
        KeyError: if a surveyor has no entry in ``surveyor_to_dates_folder``
            or ``surveyor_to_housing_assosications``.
    """
    # Hoisted out of the loops; a tuple so it can be passed straight to
    # str.endswith. Names are lower-cased before matching so that
    # upper-case extensions such as "IMG_0001.JPG" are skipped as well.
    skipped_suffixes = (".jpg", ".mov")

    for name in self.surveyor_names:
        if WEEK_COMMENCING not in self.surveyor_to_dates_folder[name]:
            continue

        for house_ass in self.surveyor_to_housing_assosications[name]:
            association_path = f"/{name}/{WEEK_COMMENCING}/{house_ass}"
            address_listing = self.get_folders_in_path(association_path)
            if 'value' not in address_listing:
                raise RuntimeError("Failed to get files to download")

            # Entries without a 'file' key are treated as directories.
            address_folders = [
                entry['name']
                for entry in address_listing['value']
                if 'file' not in entry
            ]

            for address in address_folders:
                file_listing = self.get_folders_in_path(f"{association_path}/{address}")
                if 'value' not in file_listing:
                    raise RuntimeError("Failed to get files to download")

                filtered_files = [
                    entry['name']
                    for entry in file_listing['value']
                    if 'file' in entry
                    and not entry['name'].lower().endswith(skipped_suffixes)
                ]
                self.logger.warning(filtered_files)

View file

@ -18,5 +18,6 @@ class Logger:
# Add handlers to the logger
self.logger.addHandler(c_handler)
def get_logger(self):
    """Return the logger object stored on this instance as ``self.logger``."""
    configured_logger = self.logger
    return configured_logger