survey-extraction/etl/main.py

64 lines
2.4 KiB
Python

import os
from pdfReader.pdfReaderToText import pdfReaderToText
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
from etl.utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
# Module-wide logger, configured at DEBUG level under the name "main.py".
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()

# Hard-coded paths to two "PRE SITE NOTES" PDFs under /tmp/sharepoint.
# NOTE(review): the readers below are built at import time, so these files
# presumably have to exist locally before this module is loaded - confirm.
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"

# Keep the numbering aligned: DATA_LOC_1 -> pdfReader -> doc1 and
# DATA_LOC_2 -> pdfReader2 -> doc2. The two doc names used to be crossed
# (doc2 came from DATA_LOC_1 and doc1 from DATA_LOC_2).
pdfReader = pdfReaderToText(DATA_LOC_1)
doc1 = pdfReader.get_reader()
pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc2 = pdfReader2.get_reader()
def main() -> None:
    """Entry point of the script; currently a placeholder that does nothing.

    All proof-of-concept snippets below are commented out, so calling this
    function has no effect and returns ``None``. They are kept as working
    notes for the pieces that are expected to be wired in here.
    """
    # POC PDF reader
    # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
    # pprint(list_)

    # POC scraper -> this part of the code gets the names that have the wrong
    # date format
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
    # list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
    # logger.info(pformat(list_of_names))

    # POC scraper -> this part of the code gets every variation of the
    # housing association names
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
    # logger.info(pformat(list_of_house_ass_names))

    # POC of downloading each file
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # south_coast_scraper.download_file_for_each_address()

    # POC of pdf reader
# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
# Get and read files
# and make a quick verification it was two layers
# CSV file of the survey number
# Work out a productivity metric (number of addresses in the submission folder with at least one file included)
# Khalim would like these metrics from the pdf
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions
# next week mvp
# Gather file data from SharePoint
# Start making a schema or CSV with the data we 'care about'
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
# Database of historical data. SQL data