diff --git a/etl/main.py b/etl/main.py index 6fb777c..1083233 100644 --- a/etl/main.py +++ b/etl/main.py @@ -13,30 +13,32 @@ INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission logger = Logger(name="main.py", level=logging.DEBUG).get_logger() - def main(): # POC PDF Reader # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() # pprint(list_) #POC Scraper -> This part of the code get ths names of wrong format - south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() - logger.info(pformat(list_of_names)) + # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + # south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE) + # list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() + # logger.info(pformat(list_of_names)) - # POC Scraper -> This part of the code gets every variation of housing_assocation names + # # POC Scraper -> This part of the code gets every variation of housing_assocation names # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) # list_of_house_ass_names = south_coast_scraper.get_housing_association_names() # logger.info(pformat(list_of_house_ass_names)) # POC of work completed - # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - # surverys_completed = south_coast_scraper.get_number_of_surverys_completed() - # logger.info(pformat(surverys_completed)) + south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + south_coast_scraper.download_file_for_each_address() + + + # logger.info(south_coast_scraper.surveyor_to_housing_assosications) -def print_hello(): - print("hello man") + + def create_temp_file(dict_content): with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file: @@ -52,6 +54,7 @@ if __name__ == "__main__": # Get and read files +# and make a quick verification it was two layers # CSV file of the survey number diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 0f343c1..824d3ea 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -9,10 +9,11 @@ from functools import wraps import re from etl.validator.validator import DomnaSharePointValidator -WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 24.02.2025") +WEEK_COMMENCING = os.getenv("WEEK_COMMENCING", "W.C. 03.03.2025") class SharePointInstaller(Enum): SOUTH_COAST_INSULATION_SERVICE = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None) + JJC_SERVICE = os.getenv("JJC_SERVICE_SHAREPOINT_ID", None) class SharePointScraper(): """ @@ -41,6 +42,12 @@ class SharePointScraper(): self.surveyor_work_completed = {} + # Delete me for production + self.surveyor_names = ['Abdul Koddus'] + self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']} + self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']} + + def ensure_surveyor_names_loaded(func): @@ -144,7 +151,6 @@ class SharePointScraper(): list_of_housing_association.append(house_ass_name) if house_ass_name not in self.house_association_names: self.house_association_names.append(house_ass_name) - self.surveyor_to_housing_assosications.update({name: list_of_housing_association}) else: self.logger.warning(f"Failed to get housing association folder for {name}, {self.surveyor_to_dates_folder[name]}") @@ -178,4 +184,33 @@ class SharePointScraper(): @ensure_housing_assosiation_is_loaded def download_file_for_each_address(self): - pass \ No newline at end of file + for name in self.surveyor_names: + if WEEK_COMMENCING in self.surveyor_to_dates_folder[name]: + for house_ass in self.surveyor_to_housing_assosications[name]: + address_files = self.get_folders_in_path(f"/{name}/{WEEK_COMMENCING}/{house_ass}") + if 'value' not in address_files: + raise RuntimeError("Failed to get files to download") + else: + allAddress = [] + for address in address_files['value']: + if 'file' not in address: + # Only directories + allAddress.append(address['name']) + + for address in allAddress: + path = f"/{name}/{WEEK_COMMENCING}/{house_ass}/{address}" + files_to_download_sharepoint_info = self.get_folders_in_path(path) + if 'value' not in files_to_download_sharepoint_info: + raise RuntimeError("Failed to get files to download") + else: + file_names_to_download = [] + avoid = [".jpg",".mov"] + + for file in files_to_download_sharepoint_info['value']: + if 'file' in file: + file_names_to_download.append(file['name']) + + filtered_files = [f for f in file_names_to_download if not f.endswith(tuple(avoid))] + self.logger.warning(filtered_files) + + diff --git a/etl/utils/logger.py b/etl/utils/logger.py index b31375f..62970c6 100644 --- a/etl/utils/logger.py +++ b/etl/utils/logger.py @@ -18,5 +18,6 @@ class Logger: # Add handlers to the logger self.logger.addHandler(c_handler) + def get_logger(self): return self.logger \ No newline at end of file