def get_reader(self):
    """Return the extractor matching this PDF's detected report type.

    Calls ``get_file_type()`` first so that ``self.type`` reflects the
    current ``self.text_list``, then wraps the text in the extractor for
    that type.

    Returns:
        ``QuidosSiteNotesExtractor`` for ``ReportType.QUIDOS_PRESITE_NOTE``,
        ``CSR`` for ``ReportType.CHARTED_SURVEYOR_REPORT``, and ``None``
        when the type is anything else or could not be detected.
    """
    self.get_file_type()
    report_type = self.type

    # get_file_type() leaves the type as None when no marker text matched;
    # dereferencing ``.name`` on it used to raise AttributeError here.
    if report_type is None:
        return None

    # Compare enum members directly in both branches (the original mixed a
    # ``.name`` string comparison with a member comparison).
    if report_type == ReportType.QUIDOS_PRESITE_NOTE:
        return QuidosSiteNotesExtractor(self.text_list)
    if report_type == ReportType.CHARTED_SURVEYOR_REPORT:
        return CSR(self.text_list)

    # Recognised report type without a dedicated extractor yet.
    return None
def main():
    """Download survey files, classify them and export a pricing sheet.

    Builds ``surveyedDataProcessor`` objects from what the SharePoint scraper
    downloads, prices every survey that has a pre-site note (500 /
    "CAVITY ONLY" when a chartered surveyor report was also found, otherwise
    1000 / "REMIDIAL CWI ONLY") and writes the result to ``survey_data.xlsx``
    in the current working directory.
    """
    columns = ["Address", "Surveyor's Name", "Type of Work", "Price"]

    south_coast_scraper = SharePointScraper(
        SharePointInstaller.SOUTH_COAST_INSULATION, development=True
    )
    file_paths = south_coast_scraper.download_file_for_each_address()

    list_of_surveys = []
    for each_address in file_paths:
        for address, files in each_address.items():
            list_of_surveys.append(surveyedDataProcessor(address, files))
        # NOTE(review): only the first entry returned by the scraper is
        # processed. This looks like a development shortcut - confirm the
        # intended nesting of this ``break`` and drop it for a full run.
        break

    # Build each row as one unit so the four columns can never end up with
    # different lengths (pandas refuses column lists of unequal length).
    rows = []
    for survey in list_of_surveys:
        if not survey.pre_site_note:
            continue

        if survey.csr:
            price, type_of_work = 500, "CAVITY ONLY"
        else:
            price, type_of_work = 1000, "REMIDIAL CWI ONLY"

        rows.append(
            {
                "Address": survey.address,
                # NOTE(review): surveyedDataProcessor sets address, files,
                # pre_site_note and csr only, so ``survey.pre_site`` raised
                # AttributeError. The cell stays empty until the processor
                # exposes the surveyor's name - confirm the intended source.
                "Surveyor's Name": getattr(survey, "pre_site", None),
                "Type of Work": type_of_work,
                "Price": price,
            }
        )

    # ``columns`` keeps the header order stable even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    # Save to an Excel file
    df.to_excel("survey_data.xlsx", index=False)

    print("Excel file 'survey_data.xlsx' created successfully!")
class surveyedDataProcessor():
    """Group the downloaded PDF files of one address by report type.

    Attributes set by the constructor:
        address: the address the files belong to, stored as given.
        files: the iterable of PDF file paths, stored as given.
        pre_site_note: reader for the Quidos pre-site note, or ``None``
            when no such file was found.
        csr: reader for the chartered surveyor report, or ``None`` when
            no such file was found.
    """

    def __init__(self, address, files):
        """Store the inputs and classify ``files`` immediately."""
        self.address = address
        self.files = files
        self.pre_site_note = None
        self.csr = None
        self.identify_files()

    def identify_files(self):
        """Read every file and keep a reader for each recognised report.

        When several files share a report type, the last one read wins.
        Files of any other type are ignored.
        """
        # Function-scope import keeps this class self-contained; the module
        # itself does not import logging.
        import logging

        logger = logging.getLogger(__name__)

        # ``files`` may be None or empty when nothing was downloaded.
        for file in self.files or ():
            pdf = pdfReaderToText(file)
            # Diagnostics go to the debug log instead of stdout; the former
            # print() calls dumped the full PDF text on every run.
            logger.debug("Classifying %s", file)
            logger.debug("Extracted text: %s", pdf.text_list)

            # NOTE(review): kept from the original; presumably always true
            # for a constructed reader - confirm and drop if so.
            if not pdf:
                continue

            if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
                self.pre_site_note = pdf.get_reader()
            elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
                self.csr = pdf.get_reader()
xlsx/xlsm files" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "packaging" version = "24.2" @@ -1436,4 +1463,4 @@ files = [ [metadata] lock-version = "2.1" python-versions = ">=3.12" -content-hash = "710051703d97e156a540ad08b0815338a4283146f6fca3c0ae89cc4e6dad459a" +content-hash = "7c7fb2198bf2cb04e0af34fa6769280fda46907a2024b8f4c188847962964631" diff --git a/pyproject.toml b/pyproject.toml index 4d3d01b..344a5ab 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,6 +12,7 @@ dependencies = [ "msal (>=1.31.1,<2.0.0)", "pandas (>=2.2.3,<3.0.0)", "pydantic[email] (>=2.10.6,<3.0.0)", + "openpyxl (>=3.1.5,<4.0.0)", ] [tool.poetry]