Modularised the functions for the work I'm doing

This commit is contained in:
Jun-te Kim 2025-03-13 06:38:46 +00:00
parent 6427b030f8
commit 066601f325
5 changed files with 74 additions and 31 deletions

View file

@ -17,16 +17,15 @@ doc1 = pdfReader2.get_reader()
vars(doc1)
def main():
pass
# POC PDF Reader
# list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
# pprint(list_)
# POC Scraper -> This part of the code gets the names that have the wrong date format
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
# south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
# list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
# logger.info(pformat(list_of_names))
POC Scraper -> This part of the code gets the names that have the wrong date format
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
logger.info(pformat(list_of_names))
# # POC Scraper -> This part of the code gets every variation of housing association names
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
@ -43,23 +42,3 @@ def main():
if __name__ == "__main__":
main()
# Get and read files
# and make a quick verification it was two layers
# CSV file of the survey number
# Work out productivity metric (number of addresses in the submission folder with at least one file included)
# Khalim would like these metrics from the pdf
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions
# next week mvp
# Gather file data from SharePoint
# Start making a schema or CSV with the data we 'care about'
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
# data base of historical data. SQL data

View file

@ -0,0 +1,30 @@
import os
from pdfReader.pdfReaderToText import pdfReaderToText
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
from etl.utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
# Module-level logger, registered under the name "main.py" at DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
# Hard-coded paths to two sample "PRE SITE NOTES" PDFs under /tmp/sharepoint.
# NOTE(review): these only exist on a machine where the files are already present — confirm.
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"
# The statements below run at import time, not only when the script is executed.
# NOTE(review): the numbering is crossed — DATA_LOC_1 feeds `doc2` and DATA_LOC_2 feeds `doc1`.
pdfReader = pdfReaderToText(DATA_LOC_1)
doc2 = pdfReader.get_reader()
pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc1 = pdfReader2.get_reader()
# The result of vars() is discarded; presumably left over from interactive inspection.
vars(doc1)
def main():
    """Run the proof-of-concept download.

    Builds a SharePointScraper for
    SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE and calls its
    download_file_for_each_address() method.
    """
    # POC of downloading each file
    installer = SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE
    SharePointScraper(installer).download_file_for_each_address()
    # POC of pdf reader (nothing implemented here yet)


if __name__ == "__main__":
    main()

33
etl/development.py Normal file
View file

@ -0,0 +1,33 @@
import os
from pdfReader.pdfReaderToText import pdfReaderToText
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
from etl.utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
# Module-level logger at DEBUG level.
# NOTE(review): it is registered as "main.py" although this file is etl/development.py — confirm intended.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
# Hard-coded paths to two sample "PRE SITE NOTES" PDFs under /tmp/sharepoint.
# NOTE(review): these only exist on a machine where the files are already present — confirm.
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"
# The statements below run at import time, not only when the script is executed.
# NOTE(review): the numbering is crossed — DATA_LOC_1 feeds `doc2` and DATA_LOC_2 feeds `doc1`.
pdfReader = pdfReaderToText(DATA_LOC_1)
doc2 = pdfReader.get_reader()
pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc1 = pdfReader2.get_reader()
# The result of vars() is discarded; presumably left over from interactive inspection.
vars(doc1)
def main():
    """Run the download proof-of-concept in development mode.

    Builds a SharePointScraper for
    SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE with
    ``development=True`` and calls download_file_for_each_address().
    """
    # NOTE(review): development=True appears to restrict the scraper to a
    # hard-coded surveyor/folder subset — confirm in SharePointScraper.__init__.
    scraper = SharePointScraper(
        SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE,
        development=True,
    )
    scraper.download_file_for_each_address()


if __name__ == "__main__":
    main()
# Current todo list
# - [] Finish off scraping for section that I need to finish
# - [] Pydantic format for deemed report
# - [] Generate deemed report
# - [] Docker Compose setup with an SQL database (2 hours, then work on SQL)
# - [] Deploy via terraform to aws (1 day)

View file

@ -20,7 +20,7 @@ class SharePointScraper():
A simple scraper to get the contents of a SharePoint and validate inputs so I can manually change them
"""
def __init__(self, sharepoint_location):
def __init__(self, sharepoint_location, development=False):
self.logger = Logger(name="SharePointScraper", level=logging.DEBUG).get_logger()
self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
@ -43,9 +43,10 @@ class SharePointScraper():
self.surveyor_work_completed = {}
# Delete me for production
self.surveyor_names = ['Abdul Koddus']
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
if development:
self.surveyor_names = ['Abdul Koddus']
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}

View file

@ -1 +1 @@
poetry run python etl/main.py --debug
poetry run python etl/development.py --debug