mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
69 lines
No EOL
2.5 KiB
Python
69 lines
No EOL
2.5 KiB
Python
import os
|
|
from pdfReader.pdfReaderToText import pdfReaderToText
|
|
from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
|
from pprint import pprint, pformat
|
|
import logging
|
|
import tempfile
|
|
from etl.utils.logger import Logger
|
|
import asyncio
|
|
|
|
from etl.validator.validator import DomnaSharePointValidator
|
|
|
|
# Absolute path of the project's data directory.
DATA_LOC = "/workspaces/survey-extraction/data/"

# A single sample PDF under the data directory; referenced by the
# (currently commented-out) PDF reader proof of concept in main().
INTERESTING_FILE_LOC = (
    "/workspaces/survey-extraction/data/"
    "first last/Submission 03.03.25/customer/"
    "10 Sandbeck Lane DN21 3LZ/PRE SITE NOTES.pdf"
)

# Module-wide logger, created through the project's Logger wrapper at DEBUG level.
logger = Logger(name="main.py", level=logging.DEBUG).get_logger()
|
|
|
|
|
|
def main():
    """Run the currently enabled proof of concept (POC).

    Creates a SharePointScraper for the South Coast Insulation Service
    installer, drives its ``list_of_names_that_have_the_wrong_date_format()``
    call to completion with ``asyncio.run`` and logs the pretty-printed
    result through ``logger.info``.

    The remaining POCs are kept as commented-out snippets so they can be
    re-enabled by hand.
    """
    # POC: PDF reader (disabled)
    # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test()
    # pprint(list_)

    # POC: scraper -> collect the names that use the wrong date format
    scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    wrongly_dated_names = asyncio.run(
        scraper.list_of_names_that_have_the_wrong_date_format()
    )
    logger.info(pformat(wrongly_dated_names))

    # POC: scraper -> collect every variation of housing association names (disabled)
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # list_of_house_ass_names = south_coast_scraper.get_housing_association_names()
    # logger.info(pformat(list_of_house_ass_names))

    # POC: amount of work completed (disabled)
    # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE)
    # surverys_completed = south_coast_scraper.get_number_of_surverys_completed()
    # logger.info(pformat(surverys_completed))
|
|
|
|
|
|
|
|
# def create_temp_file(dict_content):
|
|
# with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt') as temp_file:
|
|
# formatted_content = pformat(dict_content)
|
|
# temp_file.write(formatted_content + "\n")
|
|
# temp_file_path = temp_file.name
|
|
|
|
# logger.info(f"Temporary file created at: {temp_file_path}")
|
|
|
|
|
|
# Script entry point: run the proof of concept only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|
# Get and read files
|
|
|
|
# CSV file of the survey number
|
|
|
|
|
|
# Work out productivity metric (number of addresses in a submission folder with at least one file included)
|
|
|
|
|
|
# Khalim would like these metrics from the pdf
|
|
# address, UPRN, assessor's name validation, current SAP rating, current annual emissions, dimensions
|
|
|
|
|
|
|
|
# next week mvp
|
|
# Gather file data from SharePoint
|
|
# Start making a schema or CSV with the data we 'care about'
|
|
# {id: uuid, 'surveyor_name': 'khalim', 'address': '123 fake street', 'property_type': 'house', sharepoint_loc}
|
|
# Database of historical data. SQL data