survey-extraction/etl/main.py

import os
from pdfReader.pdfReaderToText import pdfReaderToText
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
from etl.utils.logger import Logger
from etl.validator.validator import DomnaSharePointValidator
# Module-level logger for this script, configured at DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()

# Hard-coded sample "PRE SITE NOTES" PDFs used while prototyping.
DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf"
DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf"

# NOTE(review): these readers are built at import time, so the sample files
# above presumably need to exist locally -- confirm against pdfReaderToText.
# Each docN is the reader obtained for DATA_LOC_N. Previously the names were
# crossed (doc2 came from DATA_LOC_1 and doc1 from DATA_LOC_2).
pdfReader = pdfReaderToText(DATA_LOC_1)
doc1 = pdfReader.get_reader()
pdfReader2 = pdfReaderToText(DATA_LOC_2)
doc2 = pdfReader2.get_reader()


def main() -> None:
    """Script entry point; currently a no-op.

    The commented-out snippets below are proof-of-concept (POC) invocations
    kept for reference. None of them run, so calling this function does
    nothing and returns ``None``.
    """
    # --- POC: PDF reader ---
    # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
    # pprint(list_)

    # --- POC: scraper -> gets the names that have the wrong date format ---
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # south_coast_scraper = SharePointScraper(SharePointInstaller.JJC_SERVICE)
    # list_of_names = south_coast_scraper.list_of_names_that_has_the_wrong_date_format()
    # logger.info(pformat(list_of_names))

    # --- POC: scraper -> gets every variation of housing association names ---
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
    # logger.info(pformat(list_of_house_ass_names))

    # --- POC: downloading each file ---
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # south_coast_scraper.download_file_for_each_address()

    # --- POC: PDF reader (nothing implemented here yet) ---
    return None


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


# Get and read files
# and make a quick verification it was two layers

# CSV file of the survey number


# Work out a productivity metric (number of addresses in the submission folder with at least one file included)

# Khalim would like these metrics from the PDF:
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions


# Next week's MVP:
# Gather file data from SharePoint
# Start making a schema or CSV with the data we 'care about'
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
# Database of historical data (SQL data)