mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
scraper goes to each folder and finds the file path
This commit is contained in:
parent
392acf5a73
commit
164f3ba00a
3 changed files with 52 additions and 13 deletions
23
etl/main.py
23
etl/main.py
|
|
@ -13,30 +13,32 @@ INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission
|
|||
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
||||
|
||||
|
||||
|
||||
def main():
|
||||
# POC PDF Reader
|
||||
# list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
|
||||
# pprint(list_)
|
||||
|
||||
# POC Scraper -> This part of the code gets the names that have the wrong date format
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
|
||||
logger.info(pformat(list_of_names))
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
|
||||
# list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
|
||||
# logger.info(pformat(list_of_names))
|
||||
|
||||
# POC Scraper -> This part of the code gets every variation of the housing association names
|
||||
# # POC Scraper -> This part of the code gets every variation of the housing association names
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
# list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
|
||||
# logger.info(pformat(list_of_house_ass_names))
|
||||
|
||||
# POC of work completed
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
# surverys_completed = south_coast_scraper.get_number_of_surverys_completed()
|
||||
# logger.info(pformat(surverys_completed))
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
south_coast_scraper.download_file_for_each_address()
|
||||
|
||||
|
||||
# logger.info(south_coast_scraper.surveyor_to_housing_assosications)
|
||||
|
||||
|
||||
def print_hello():
|
||||
print("hello man")
|
||||
|
||||
|
||||
|
||||
def create_temp_file(dict_content):
|
||||
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
|
||||
|
|
@ -52,6 +54,7 @@ if __name__ == "__main__":
|
|||
|
||||
|
||||
# Get and read files
|
||||
# and make a quick verification it was two layers
|
||||
|
||||
# CSV file of the survey number
|
||||
|
||||
|
|
|
|||
|
|
@ -9,10 +9,11 @@ from functools import wraps
|
|||
import re
|
||||
from etl.validator.validator import DomnaSharePointValidator
|
||||
|
||||
WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 24.02.2025")
|
||||
WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 03.03.2025")
|
||||
|
||||
class SharePointInstaller(Enum):
|
||||
SOUTH_COAST_INSULATION_SERVICE = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)
|
||||
JJC_SERVICE = os.getenv("JJC_SERVICE_SHAREPOINT_ID", None)
|
||||
|
||||
class SharePointScraper():
|
||||
"""
|
||||
|
|
@ -41,6 +42,12 @@ class SharePointScraper():
|
|||
|
||||
self.surveyor_work_completed = {}
|
||||
|
||||
# TODO: delete the hard-coded test values below before production
|
||||
self.surveyor_names = ['Abdul Koddus']
|
||||
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
|
||||
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
|
||||
|
||||
|
||||
|
||||
|
||||
def ensure_surveyor_names_loaded(func):
|
||||
|
|
@ -144,7 +151,6 @@ class SharePointScraper():
|
|||
list_of_housing_association.append(house_ass_name)
|
||||
if house_ass_name not in self.house_association_names:
|
||||
self.house_association_names.append(house_ass_name)
|
||||
|
||||
self.surveyor_to_housing_assosications.update({name: list_of_housing_association})
|
||||
else:
|
||||
self.logger.warning(f"Failed to get housing association folder for {name}, {self.surveyor_to_dates_folder[name]}")
|
||||
|
|
@ -178,4 +184,33 @@ class SharePointScraper():
|
|||
|
||||
@ensure_housing_assosiation_is_loaded
|
||||
def download_file_for_each_address(self):
|
||||
pass
|
||||
for name in self.surveyor_names:
|
||||
if WEEK_COMMENCING in self.surveyor_to_dates_folder[name]:
|
||||
for house_ass in self.surveyor_to_housing_assosications[name]:
|
||||
address_files = self.get_folders_in_path(f"/{name}/{WEEK_COMMENCING}/{house_ass}")
|
||||
if 'value' not in address_files:
|
||||
raise RuntimeError("Failed to get files to download")
|
||||
else:
|
||||
allAddress = []
|
||||
for address in address_files['value']:
|
||||
if 'file' not in address:
|
||||
# Keep only directories, i.e. entries without a 'file' key
|
||||
allAddress.append(address['name'])
|
||||
|
||||
for address in allAddress:
|
||||
path = f"/{name}/{WEEK_COMMENCING}/{house_ass}/{address}"
|
||||
files_to_download_sharepoint_info = self.get_folders_in_path(path)
|
||||
if 'value' not in files_to_download_sharepoint_info:
|
||||
raise RuntimeError("Failed to get files to download")
|
||||
else:
|
||||
file_names_to_download = []
|
||||
avoid = [".jpg",".mov"]
|
||||
|
||||
for file in files_to_download_sharepoint_info['value']:
|
||||
if 'file' in file:
|
||||
file_names_to_download.append(file['name'])
|
||||
|
||||
filtered_files = [f for f in file_names_to_download if not f.endswith(tuple(avoid))]
|
||||
self.logger.warning(filtered_files)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -18,5 +18,6 @@ class Logger:
|
|||
# Add handlers to the logger
|
||||
self.logger.addHandler(c_handler)
|
||||
|
||||
|
||||
def get_logger(self):
|
||||
return self.logger
|
||||
Loading…
Add table
Reference in a new issue