# survey-extraction/etl/main.py

import os
from pdfReader.pdfReaderToText import pdfReaderToText
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pprint, pformat
import logging
import tempfile
from etl.utils.logger import Logger
import asyncio

from etl.validator.validator import DomnaSharePointValidator

# Root directory of the locally available survey data.
DATA_LOC = "/workspaces/survey-extraction/data/"
# Sample PDF used by the PDF-reader proof of concept; it lives under DATA_LOC.
INTERESTING_FILE_LOC = (
    DATA_LOC
    + "first last/Submission 03.03.25/customer/"
    + "10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
)
# Module-level logger, created through the project's Logger wrapper at DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()


def main():
    """Run the currently enabled proof of concept and log its result.

    Creates a ``SharePointScraper`` for the
    ``SOUTH_COAST_INSULATION_SERVICE`` installer, runs
    ``list_of_names_that_have_the_wrong_date_format()`` to completion with
    ``asyncio.run`` and logs the pretty-printed result at INFO level.

    The remaining proofs of concept are kept as commented-out snippets so
    they can be re-enabled by hand.
    """
    # POC: PDF reader (disabled).
    # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
    # pprint(list_)

    # POC: scraper -> collects the names that use the wrong date format.
    installer = SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE
    scraper = SharePointScraper(installer)
    pending_lookup = scraper.list_of_names_that_have_the_wrong_date_format()
    wrongly_dated_names = asyncio.run(pending_lookup)
    logger.info(pformat(wrongly_dated_names))

    # POC: scraper -> collects every variation of the housing association names (disabled).
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
    # logger.info(pformat(list_of_house_ass_names))

    # POC: amount of work completed (disabled).
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # surverys_completed = south_coast_scraper.get_number_of_surverys_completed()
    # logger.info(pformat(surverys_completed))


# def create_temp_file(dict_content):
#     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
#         formatted_content = pformat(dict_content)
#         temp_file.write(formatted_content + "\n")
#         temp_file_path = temp_file.name

#     logger.info(f"Temporary file created at: {temp_file_path}")


# Run the proof of concept only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


# Get and read files

# CSV file of the survey number


# Work out productivity metric (number of addresses in the submission folder with at least one file included)


# Khalim would like these metrics from the PDF:
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions


# Next week's MVP
# Gather file data from SharePoint
# Start making a schema or CSV with the data we 'care about'
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
# Database of historical data (SQL)