From 7c887d85312b1034cefee367dda887631486c2fa Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Mon, 17 Mar 2025 21:09:29 +0000 Subject: [PATCH] run it live and see what happens --- etl/daily_script.py | 26 +++++++++++++++++--------- etl/pdfReader/pdfReaderToText.py | 5 +++-- etl/pdfReader/reportType.py | 2 +- etl/scraper/scraper.py | 31 ++++++++++++++++++++++++++++--- etl/validator/validator.py | 10 ++++++++++ 5 files changed, 59 insertions(+), 15 deletions(-) diff --git a/etl/daily_script.py b/etl/daily_script.py index 27e3df1..96fdbc7 100644 --- a/etl/daily_script.py +++ b/etl/daily_script.py @@ -34,8 +34,10 @@ def main(): total_dict = dict(Counter(south_coast_submissions) + Counter(jjc_coast_submission) + Counter(sgec_submission) + Counter(BAXTER_KELLY_submissions)) - logger.info("-------------------------------------------") logger.info("Good morning Cyrus") + logger.info("-------------------------------------------") + logger.info("-------------WRONG DATE FORMAT-------------") + logger.info("-------------------------------------------") if south_coast_names: logger.info("South Coast with wrong date format:") logger.info(pformat(south_coast_names)) @@ -53,15 +55,21 @@ def main(): logger.info("Baxter Kelly with wrong date format") logger.info(pformat(b_names)) logger.info("-------------------------------------------") - logger.info(f"For week commencing: {WEEK_COMMENCING}") - logger.info(f"Submissions: {pformat(total_dict)}") + logger.info("------EACH PRE SITE NOTES SUBMISSIONS------") + logger.info("-------------------------------------------") + logger.info(f"For week commencing: {WEEK_COMMENCING}") + logger.info(f"South Coast Submissions: {pformat(south_coast_submissions)}") + logger.info(f"JJC: {pformat(jjc_coast_submission)}") + logger.info(f"SGEC Submissions: {pformat(sgec_submission)}") + logger.info(f"Baxter Kelly: {pformat(BAXTER_KELLY_submissions)}") + logger.info("-------------------------------------------") + logger.info("-----TOTAL PRE SITE NOTES SUBMISSIONS------") + logger.info("-------------------------------------------") + logger.info(f"For week commencing: {WEEK_COMMENCING}") + logger.info(f"Total Submissions: {pformat(total_dict)}") + logger.info("-------------------------------------------") + logger.info("---BROUGHT TO YOU BY THE DOMNA TECH TEAM---") logger.info("-------------------------------------------") - # Make a quick script that checks if the Pictures folder exists in a certain fail directory - - - # Make a cron job in github runner for Cyrus for this - - logger.info("Hope this helps! <3") if __name__ == "__main__": diff --git a/etl/pdfReader/pdfReaderToText.py b/etl/pdfReader/pdfReaderToText.py index a3ab42d..7ccca7a 100644 --- a/etl/pdfReader/pdfReaderToText.py +++ b/etl/pdfReader/pdfReaderToText.py @@ -30,10 +30,11 @@ class pdfReaderToText(): def get_file_type(self): if len(self.text_list) > 1: if "Quidos Ltd using Argyle software BRE approved calculator".lower() in self.text_list[0].lower(): - self.type = ReportType.QUIDOS_SITE_NOTE + self.type = ReportType.QUIDOS_PRESITE_NOTE return self.type else: - raise NotImplementedError("New type of file - please contact Jun-te Kim") + return None + # raise NotImplementedError("New type of file - please contact Jun-te Kim") def get_reader(self): self.get_file_type() diff --git a/etl/pdfReader/reportType.py b/etl/pdfReader/reportType.py index 09f11ef..6433d79 100644 --- a/etl/pdfReader/reportType.py +++ b/etl/pdfReader/reportType.py @@ -2,4 +2,4 @@ from enum import Enum class ReportType(Enum): - QUIDOS_SITE_NOTE = 1 \ No newline at end of file + QUIDOS_PRESITE_NOTE = 1 \ No newline at end of file diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index 06dc26e..1f4334a 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -193,12 +193,37 @@ class SharePointScraper(): def get_number_of_surverys_completed(self): for name in self.surveyor_names: if name in self.surveyor_to_housing_assosications: - for house_assosication in self.surveyor_to_housing_assosications[name]: - address_folders = self.get_folders_in_path(f"/{name}/{WEEK_COMMENCING}/{house_assosication}") + for house_ass in self.surveyor_to_housing_assosications[name]: + address_folders = self.get_folders_in_path(f"/{name}/{WEEK_COMMENCING}/{house_ass}") if 'value' not in address_folders: raise RuntimeError("Failed to get address folders") else: - self.surveyor_work_completed.update({name: len(address_folders['value'])}) + allAddress = [] + for address in address_folders['value']: + if 'file' not in address: + allAddress.append(address['name']) + + for address in allAddress: + path = f"/{name}/{WEEK_COMMENCING}/{house_ass}/{address}" + files_to_download_sharepoint_info = self.get_folders_in_path(path) + if 'value' not in files_to_download_sharepoint_info: + raise RuntimeError("Failed to get files to download") + else: + file_names_to_download = {} + only_pdf = [".pdf"] + for file in files_to_download_sharepoint_info['value']: + if 'file' in file: + if any(file["name"].endswith(ext) for ext in only_pdf): + file_names_to_download.update({file["name"]: file['@microsoft.graph.downloadUrl']}) + for file_name, url in file_names_to_download.items(): + content = self.get_file_content(url) + path = self.create_temp_file(content, f"{name}/{WEEK_COMMENCING}/{house_ass}/{address}/{file_name}") + if DomnaSharePointValidator.is_quidos_presite(path): + if name in self.surveyor_work_completed: + self.surveyor_work_completed[name] += 1 + else: + self.surveyor_work_completed.update({name: 1}) + break return self.surveyor_work_completed diff --git a/etl/validator/validator.py b/etl/validator/validator.py index 45f9822..760a663 100644 --- a/etl/validator/validator.py +++ b/etl/validator/validator.py @@ -2,6 +2,9 @@ import os import logging from etl.utils.logger import Logger import re +from etl.pdfReader.pdfReaderToText import pdfReaderToText +from etl.pdfReader.reportType import ReportType + class DomnaSharePointValidator(): """ @@ -21,6 +24,13 @@ class DomnaSharePointValidator(): return False return True + def is_quidos_presite(file_path): + file = pdfReaderToText(file_path) + type = file.get_file_type() + print(type) + return type == ReportType.QUIDOS_PRESITE_NOTE + + def valid_social_housing_associations(list_): """ Nic gave me a list of housing association names, will most likely use in future so leaving it here