diff --git a/etl/src/etl/main.py b/etl/src/etl/main.py index dc46a1e..ef71c46 100644 --- a/etl/src/etl/main.py +++ b/etl/src/etl/main.py @@ -15,14 +15,14 @@ logger = Logger(name="main.py", level=logging.DEBUG).get_logger() def main(): # POC PDF Reader - #list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() - #pprint(list_) + list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() + pprint(list_) # POC Scraper from sharepoint and get useful data - south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() - logger.info(pformat(list_of_names)) + # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + # list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() + # logger.info(pformat(list_of_names)) @@ -37,3 +37,7 @@ def main(): if __name__ == "__main__": main() + +# Khalim would like these metrics from the pdf +# address, uprn, assessor's name validation, current sap rating, current annual emissions. Dimension +# \ No newline at end of file diff --git a/etl/src/etl/validator/validator.py b/etl/src/etl/validator/validator.py index c6c2594..0f0e820 100644 --- a/etl/src/etl/validator/validator.py +++ b/etl/src/etl/validator/validator.py @@ -12,6 +12,7 @@ class DomnaSharePointValidator(): def valid_dates(list_of_dates_to_check): # Patten Nic wants: W.C. DD.MM.YYYY + # TODO: Ideally split the date and W.C. and use a datetime library so it's more standardised pattern = r"^W\.C\. (0[1-9]|[12][0-9]|3[01])\.(0[1-9]|1[0-2])\.(\d{4})$"