mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-08 11:17:29 +00:00
64 lines
2.4 KiB
Python
64 lines
2.4 KiB
Python
import os
|
|
from pdfReader.pdfReaderToText import pdfReaderToText
|
|
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
|
from pprint import pprint, pformat
|
|
import logging
|
|
from etl.utils.logger import Logger
|
|
from etl.validator.validator import DomnaSharePointValidator
|
|
# Module-level logger for this script, built through the project's Logger
# wrapper with the DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()

# Paths of two "PRE SITE NOTES" PDFs under /tmp/sharepoint, used by the
# reader setup below.
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"

# These statements run at import time, before main() is called.
# NOTE(review): the numbering is crossed -- doc2 comes from DATA_LOC_1 and
# doc1 comes from DATA_LOC_2. The bindings are kept exactly as they were;
# confirm which pairing is intended before relying on these names.
pdfReader = pdfReaderToText(DATA_LOC_1)
doc2 = pdfReader.get_reader()

pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc1 = pdfReader2.get_reader()
|
|
|
|
|
|
def main() -> None:
    """Entry point of the script; currently a placeholder that does nothing.

    Returns:
        None.
    """
|
|
# POC PDF Reader
|
|
# list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
|
|
# pprint(list_)
|
|
|
|
# POC Scraper -> This part of the code gets the names that have the wrong date format
|
|
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
|
# south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
|
|
# list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
|
|
# logger.info(pformat(list_of_names))
|
|
|
|
# # POC Scraper -> This part of the code gets every variation of housing association names
|
|
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
|
# list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
|
|
# logger.info(pformat(list_of_house_ass_names))
|
|
|
|
# POC of downloading each file
|
|
# south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
|
|
# south_coast_scraper.download_file_for_each_address()
|
|
|
|
# POC of pdf reader
|
|
|
|
|
|
|
|
# Call main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
# Get and read files
|
|
# and make a quick verification it was two layers
|
|
|
|
# CSV file of the survey number
|
|
|
|
|
|
# Work out productivity metric (number of addresses in the submission folder with at least one file included)
|
|
|
|
# Khalim would like these metrics from the pdf
|
|
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions
|
|
|
|
|
|
|
|
# next week mvp
|
|
# Gather file data from SharePoint
|
|
# Start making a schema or CSV with the data we 'care about'
|
|
# {'id': uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', 'sharepoint_loc': ...}
|
|
# Database of historical data (SQL)
|