mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
Modularised the functions of the work I'm doing
This commit is contained in:
parent
6427b030f8
commit
066601f325
5 changed files with 74 additions and 31 deletions
|
|
@ -17,16 +17,15 @@ doc1 = pdfReader2.get_reader()
|
|||
vars(doc1)
|
||||
|
||||
def main():
|
||||
pass
|
||||
# POC PDF Reader
|
||||
# list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
|
||||
# pprint(list_)
|
||||
|
||||
# POC Scraper -> This part of the code gets the names that have the wrong date format
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
|
||||
# list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
|
||||
# logger.info(pformat(list_of_names))
|
||||
# POC Scraper -> This part of the code gets the names that have the wrong date format
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
|
||||
list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
|
||||
logger.info(pformat(list_of_names))
|
||||
|
||||
# # POC Scraper -> This part of the code gets every variation of housing_association names
|
||||
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
||||
|
|
@ -43,23 +42,3 @@ def main():
|
|||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
# Get and read files
|
||||
# and make a quick verification it was two layers
|
||||
|
||||
# CSV file of the survey number
|
||||
|
||||
|
||||
# Work out productivity metric (number of addresses in submission folder, with at least one file included)
|
||||
|
||||
# Khalim would like these metrics from the pdf
|
||||
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions
|
||||
|
||||
|
||||
|
||||
# next week mvp
|
||||
# Gather file data from SharePoint
|
||||
# Start making a schema or CSV with the data we 'care about'
|
||||
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
|
||||
# Database of historical data (SQL)
|
||||
30
etl/deemed_report_generator.py
Normal file
30
etl/deemed_report_generator.py
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
import os
|
||||
from pdfReader.pdfReaderToText import pdfReaderToText
|
||||
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
||||
from pprint import pprint, pformat
|
||||
import logging
|
||||
from etl.utils.logger import Logger
|
||||
from etl.validator.validator import DomnaSharePointValidator
|
||||
|
||||
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
||||
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
|
||||
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"
|
||||
|
||||
pdfReader = pdfReaderToText(DATA_LOC_1)
|
||||
doc2 = pdfReader.get_reader()
|
||||
pdfReader2 = pdfReaderToText(DATA_LOC_2)
|
||||
doc1 = pdfReader2.get_reader()
|
||||
vars(doc1)
|
||||
|
||||
def main():
    """Run the proof-of-concept SharePoint download.

    Builds a ``SharePointScraper`` for
    ``SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE`` and calls its
    ``download_file_for_each_address()`` method.
    """
    # POC of downloading each file
    scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    scraper.download_file_for_each_address()
|
||||
|
||||
# POC of pdf reader
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
33
etl/development.py
Normal file
33
etl/development.py
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
import os
|
||||
from pdfReader.pdfReaderToText import pdfReaderToText
|
||||
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
||||
from pprint import pprint, pformat
|
||||
import logging
|
||||
from etl.utils.logger import Logger
|
||||
from etl.validator.validator import DomnaSharePointValidator
|
||||
|
||||
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
||||
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
|
||||
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"
|
||||
|
||||
pdfReader = pdfReaderToText(DATA_LOC_1)
|
||||
doc2 = pdfReader.get_reader()
|
||||
pdfReader2 = pdfReaderToText(DATA_LOC_2)
|
||||
doc1 = pdfReader2.get_reader()
|
||||
vars(doc1)
|
||||
|
||||
def main():
    """Run the SharePoint download with the scraper in development mode.

    Builds a ``SharePointScraper`` for
    ``SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE`` with
    ``development=True`` and calls ``download_file_for_each_address()``.
    """
    scraper = SharePointScraper(
        SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE,
        development=True,
    )
    scraper.download_file_for_each_address()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
|
||||
# Current todo list
|
||||
|
||||
# - [] Finish off scraping for section that I need to finish
|
||||
# - [] Pydantic format for deemed report
|
||||
# - [] Generate deemed report
|
||||
# - [] Docker Compose setup with an SQL database (2 hours, then work on SQL)
|
||||
# - [] Deploy via terraform to aws (1 day)
|
||||
|
|
@ -20,7 +20,7 @@ class SharePointScraper():
|
|||
A simple scraper to get the contents of a SharePoint and validate inputs so I can manually change them
|
||||
"""
|
||||
|
||||
def __init__(self, sharepoint_location):
|
||||
def __init__(self, sharepoint_location, development=False):
|
||||
self.logger = Logger(name="SharePointScraper", level=logging.DEBUG).get_logger()
|
||||
self.sharepoint_client_id = os.getenv("SHAREPOINT_CLIENT_ID", None)
|
||||
self.sharepoint_client_secret = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
|
||||
|
|
@ -43,9 +43,10 @@ class SharePointScraper():
|
|||
self.surveyor_work_completed = {}
|
||||
|
||||
# Delete me for production
|
||||
self.surveyor_names = ['Abdul Koddus']
|
||||
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
|
||||
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
|
||||
if development:
|
||||
self.surveyor_names = ['Abdul Koddus']
|
||||
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
|
||||
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
poetry run python etl/main.py --debug
|
||||
poetry run python etl/development.py --debug
|
||||
Loading…
Add table
Reference in a new issue