Allows us to see a naughty list of names that need modification for dates

This commit is contained in:
Jun-te Kim 2025-03-05 18:04:41 +00:00
parent 70bc5d8b5a
commit 67bdd6b259
3 changed files with 24 additions and 48 deletions

View file

@ -6,15 +6,14 @@ import logging
import tempfile
from utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
DATA_LOC = "/workspaces/survey-extraction/data/"
INTERESTING_FILE_LOC = "/workspaces/survey-extraction/data/first last/Submission 03.03.25/customer/10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
def main():
# POC Validator
# RetroHomeFileStructureValidator(DATA_LOC)
# POC PDF Reader
#list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
#pprint(list_)
@ -22,43 +21,18 @@ def main():
# POC Scraper from sharepoint and get useful data
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
list_of_names = south_coast_scraper.get_date_folder_names()
list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
logger.info(pformat(list_of_names))
# # Get each name
# for survey_folder in installer_folders['value']:
# if survey_folder["name"] not in surverys_work_completed:
# surverys_work_completed.update({survey_folder["name"]: 0})
# for name, value in surverys_work_completed.items():
# housing_assosciation = sharepoint_client.list_folder_contents(f"/{name}")
# if 'value' in housing_assosciation:
# if len(housing_assosciation['value']) > 0:
# first_layer_folder_structure.update({name: housing_assosciation['value'][0]['name']})
# else:
# logger.info(f"Skipping 1 {name}")
# else:
# logger.info(f"Skipping 2 {name}")
# def create_temp_file(dict_content):
# with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
# formatted_content = pformat(dict_content)
# temp_file.write(formatted_content + "\n")
# temp_file_path = temp_file.name
# logger.info(pformat(first_layer_folder_structure))
# logger.info(pformat(surverys_work_completed))
def create_temp_file(dict_content):
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
formatted_content = pformat(dict_content)
temp_file.write(formatted_content + "\n")
temp_file_path = temp_file.name
logger.info(f"Temporary file created at: {temp_file_path}")
# logger.info(f"Temporary file created at: {temp_file_path}")
if __name__ == "__main__":

View file

@ -7,7 +7,7 @@ import logging
from utils.sharepoint.sharepoint import SharePointClient
from functools import wraps
import re
from validator.validator import DomnaSharePointValidator
from etl.validator.validator import DomnaSharePointValidator
class SharePointInstaller(Enum):
SOUTH_COAST_INSULATION_SERVICE = os.getenv("SOUTH_COAST_INSULATION_SERVICE_SHAREPOINT_ID", None)
@ -82,7 +82,6 @@ class SharePointScraper():
list_of_dates = []
for dates in dates_folders['value']:
self.logger.info(pformat(dates))
if 'name' in dates:
list_of_dates.append(dates['name'])
@ -104,12 +103,15 @@ class SharePointScraper():
@ensure_dates_folder_loaded
def list_of_names_that_has_the_wrong_date_format(self):
naughty_names = []
good_names = []
# Patten Nic wants: W.C. DD.MM.YYYY
pattern = r"^W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})$"
for names, dates in self.surveyor_to_dates_folder:
if
for s in test_strings:
match = re.match(pattern, s)
print(f"{s}: {'Valid' if match else 'Invalid'}")
for name, dates in self.surveyor_to_dates_folder.items():
self.logger.info(dates)
if DomnaSharePointValidator.valid_dates(dates):
good_names.append(name)
else:
naughty_names.append(name)
self.logger.info(f"Dates are correct for the following folders {good_names}")
self.logger.info(f"Dates are wrong for the following folders {naughty_names}")
return naughty_names

View file

@ -10,13 +10,13 @@ class DomnaSharePointValidator():
def __init__(self):
self.logger = Logger(name='RetroHomeFileStructureValidator', level=logging.DEBUG).get_logger()
def valid_dates(self, list_of_dates_to_check):
def valid_dates(list_of_dates_to_check):
# Patten Nic wants: W.C. DD.MM.YYYY
pattern = r"^W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})$"
for date in list_of_dates_to_check:
match = re.match(pattern, date)
if match is False:
if not re.match(pattern, date):
return False
return True