diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index 1dea61e..dc46a1e 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -6,15 +6,14 @@ import logging import tempfile from utils.logger import Logger +from etl.validator.validator import DomnaSharePointValidator + DATA_LOC = "/workspaces/survey-extraction/data/" INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf" logger = Logger(name="main.py", level=logging.DEBUG).get_logger() def main(): - # POC Validator - # RetroHomeFileStructureValidator(DATA_LOC) - # POC PDF Reader #list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() #pprint(list_) @@ -22,43 +21,18 @@ def main(): # POC Scraper from sharepoint and get useful data south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - list_of_names = south_coast_scraper.get_date_folder_names() + list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() logger.info(pformat(list_of_names)) - # # Get each name - # for survey_folder in installer_folders['value']: - # if survey_folder["name"] not in surverys_work_completed: - # surverys_work_completed.update({survey_folder["name"]: 0}) - # for name, value in surverys_work_completed.items(): - # housing_assosciation = sharepoint_client.list_folder_contents(f"/{name}") - # if 'value' in housing_assosciation: - # if len(housing_assosciation['value']) > 0: - # first_layer_folder_structure.update({name: housing_assosciation['value'][0]['name']}) - # else: - # logger.info(f"Skipping 1 {name}") - # else: - # logger.info(f"Skipping 2 {name}") +# def create_temp_file(dict_content): +# with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file: +# formatted_content = pformat(dict_content) +# temp_file.write(formatted_content + "\n") +# temp_file_path = temp_file.name - # logger.info(pformat(first_layer_folder_structure)) - - - # logger.info(pformat(surverys_work_completed)) - - - - - - - -def create_temp_file(dict_content): - with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file: - formatted_content = pformat(dict_content) - temp_file.write(formatted_content + "\n") - temp_file_path = temp_file.name - - logger.info(f"Temporary file created at: {temp_file_path}") +# logger.info(f"Temporary file created at: {temp_file_path}") if __name__ == "__main__": diff --git a/etl/src/etl/scraper/scraper.py b/etl/src/etl/scraper/scraper.py index b6c34dd..c88baa2 100644 --- a/etl/src/etl/scraper/scraper.py +++ b/etl/src/etl/scraper/scraper.py @@ -7,7 +7,7 @@ import logging from utils.sharepoint.sharepoint import SharePointClient from functools import wraps import re -from validator.validator import DomnaSharePointValidator +from etl.validator.validator import DomnaSharePointValidator class SharePointInstaller(Enum): SOUTH_COAST_INSULATION_SERVICE = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None) @@ -82,7 +82,6 @@ class SharePointScraper(): list_of_dates = [] for dates in dates_folders['value']: - self.logger.info(pformat(dates)) if 'name' in dates: list_of_dates.append(dates['name']) @@ -104,12 +103,15 @@ class SharePointScraper(): @ensure_dates_folder_loaded def list_of_names_that_has_the_wrong_date_format(self): naughty_names = [] + good_names = [] - # Patten Nic wants: W.C. DD.MM.YYYY - pattern = r"^W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})$" - - for names, dates in self.surveyor_to_dates_folder: - if - for s in test_strings: - match = re.match(pattern, s) - print(f"{s}: {'Valid' if match else 'Invalid'}") + for name, dates in self.surveyor_to_dates_folder.items(): + self.logger.info(dates) + if DomnaSharePointValidator.valid_dates(dates): + good_names.append(name) + else: + naughty_names.append(name) + + self.logger.info(f"Dates are correct for the following folders {good_names}") + self.logger.info(f"Dates are wrong for the following folders {naughty_names}") + return naughty_names diff --git a/etl/src/etl/validator/validator b/etl/src/etl/validator/validator.py similarity index 82% rename from etl/src/etl/validator/validator rename to etl/src/etl/validator/validator.py index 9204b58..c6c2594 100644 --- a/etl/src/etl/validator/validator +++ b/etl/src/etl/validator/validator.py @@ -10,13 +10,13 @@ class DomnaSharePointValidator(): def __init__(self): self.logger = Logger(name='RetroHomeFileStructureValidator', level=logging.DEBUG).get_logger() - def valid_dates(self, list_of_dates_to_check): + def valid_dates(list_of_dates_to_check): # Patten Nic wants: W.C. DD.MM.YYYY pattern = r"^W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})$" + for date in list_of_dates_to_check: - match = re.match(pattern, date) - if match is False: + if not re.match(pattern, date): return False return True