From 066601f325d0744d41af75c9360a57cd88a3687e Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 13 Mar 2025 06:38:46 +0000 Subject: [PATCH] modulised the functions of work I'm doing --- etl/{main.py => daily_script.py} | 31 +++++------------------------- etl/deemed_report_generator.py | 30 +++++++++++++++++++++++++++++ etl/development.py | 33 ++++++++++++++++++++++++++++++++ etl/scraper/scraper.py | 9 +++++---- run_etl.sh | 2 +- 5 files changed, 74 insertions(+), 31 deletions(-) rename etl/{main.py => daily_script.py} (58%) create mode 100644 etl/deemed_report_generator.py create mode 100644 etl/development.py diff --git a/etl/main.py b/etl/daily_script.py similarity index 58% rename from etl/main.py rename to etl/daily_script.py index a9ac4cb..1e63b5c 100644 --- a/etl/main.py +++ b/etl/daily_script.py @@ -17,16 +17,15 @@ doc1 = pdfReader2.get_reader() vars(doc1) def main(): - pass # POC PDF Reader # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() # pprint(list_) - #POC Scraper -> This part of the code get ths names of wrong format - # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - # south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE) - # list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() - # logger.info(pformat(list_of_names)) + # POC Scraper -> This part of the code gets the names with the wrong date format + south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE) + list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format() + logger.info(pformat(list_of_names)) # # POC Scraper -> This part of the code gets every variation of housing_assocation names # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) @@ -43,23 +42,3 @@ def main(): if __name__ == "__main__": main() - - -# Get and read 
files -# and make a quick verification it was two layers - -# CSV file of the survey number - - -# Work out productivity metirc (number of address in submission folder, with at least one file included) - -# Khalim would like these metrics from the pdf -# address, uprn, assessor's name validation, current sap rating, current annual emissions. DImension - - - -# next week mvp -# Gather file data form sharepoint -# Start making a scheme or csv with the data we 'care about' -# {id: uuid, 'surveryor_name': 'khalim', 'address': '123 fake street', 'prorperty_type': 'house', sharepoint_loc} -# data base of historical data. SQL data diff --git a/etl/deemed_report_generator.py b/etl/deemed_report_generator.py new file mode 100644 index 0000000..fa76f75 --- /dev/null +++ b/etl/deemed_report_generator.py @@ -0,0 +1,30 @@ +import os +from pdfReader.pdfReaderToText import pdfReaderToText +from etl.scraper.scraper import SharePointScraper, SharePointInstaller +from pprint import pprint, pformat +import logging +from etl.utils.logger import Logger +from etl.validator.validator import DomnaSharePointValidator + +logger = Logger(name="main.py", level=logging.DEBUG).get_logger() +DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" +DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 
03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" + +pdfReader = pdfReaderToText(DATA_LOC_1) +doc2 = pdfReader.get_reader() +pdfReader2 = pdfReaderToText(DATA_LOC_2) +doc1 = pdfReader2.get_reader() +vars(doc1) + +def main(): + pass + # POC of downloading each file + south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + south_coast_scraper.download_file_for_each_address() + + # POC of pdf reader + + + +if __name__ == "__main__": + main() diff --git a/etl/development.py b/etl/development.py new file mode 100644 index 0000000..f06c74f --- /dev/null +++ b/etl/development.py @@ -0,0 +1,33 @@ +import os +from pdfReader.pdfReaderToText import pdfReaderToText +from etl.scraper.scraper import SharePointScraper, SharePointInstaller +from pprint import pprint, pformat +import logging +from etl.utils.logger import Logger +from etl.validator.validator import DomnaSharePointValidator + +logger = Logger(name="main.py", level=logging.DEBUG).get_logger() +DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" +DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 
03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" + +pdfReader = pdfReaderToText(DATA_LOC_1) +doc2 = pdfReader.get_reader() +pdfReader2 = pdfReaderToText(DATA_LOC_2) +doc1 = pdfReader2.get_reader() +vars(doc1) + +def main(): + south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE, development = True) + south_coast_scraper.download_file_for_each_address() + +if __name__ == "__main__": + main() + + +# Current todo list + +# - [] Finish off scraping for section that I need to finish +# - [] Pydantic format for deemed report +# - [] Generate deemed report +# - [] Docker compose to an sql database in docker compose (2 hours, then work on sql) +# - [] Deploy via terraform to aws (1 day) \ No newline at end of file diff --git a/etl/scraper/scraper.py b/etl/scraper/scraper.py index c1fc0fd..cc5e99b 100644 --- a/etl/scraper/scraper.py +++ b/etl/scraper/scraper.py @@ -20,7 +20,7 @@ class SharePointScraper(): A simple scraper to get the contents of a sharepoint and validatate inputs so I can manually change """ - def __init__(self, sharepoint_location): + def __init__(self, sharepoint_location, development=False): self.logger = Logger(name="SharePointScraper", level=logging.DEBUG).get_logger() self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None) self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None) @@ -43,9 +43,10 @@ class SharePointScraper(): self.surveyor_work_completed = {} # Delete me for production - self.surveyor_names = ['Abdul Koddus'] - self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']} - self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']} + if development: + self.surveyor_names = ['Abdul Koddus'] + self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']} + self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 
24.02.2025']} diff --git a/run_etl.sh b/run_etl.sh index bdbe14d..52f43d8 100644 --- a/run_etl.sh +++ b/run_etl.sh @@ -1 +1 @@ -poetry run python etl/main.py --debug \ No newline at end of file +poetry run python etl/development.py --debug \ No newline at end of file