time to find out if it works for all

2026-06-30 13:10:56 +00:00 · 2025-03-19 14:20:26 +00:00 · 2025-03-19 14:20:26 +00:00 · 9f9abe2280
commit 9f9abe2280
parent 32fb397ca1
8 changed files with 112 additions and 15 deletions
--- a/etl/pdfReader/pdfReaderToText.py
+++ b/etl/pdfReader/pdfReaderToText.py
@ -1,7 +1,7 @@
 from etl.utils.logger import Logger
 import logging
 import pymupdf
-from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor
+from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR
 from etl.pdfReader.reportType import ReportType

 class pdfReaderToText():
@ -13,6 +13,7 @@ class pdfReaderToText():
        self.text_list = []
        self.get_text_from_pdf_file()
        self.type = None
+        self.get_file_type()

    def get_text_from_pdf_file(self):
        self.logger.debug(f"Extrating text from {self.source_path}")
@ -31,6 +32,14 @@ class pdfReaderToText():
        if len(self.text_list) > 1:
            if  "Quidos Ltd using Argyle software BRE approved calculator".lower() in self.text_list[0].lower():
                self.type = ReportType.QUIDOS_PRESITE_NOTE
+            elif "Wall pre - Masonry cavity wall-unĮlled".lower() in self.text_list[0].lower():
+                self.type = ReportType.U_VALUE_CALCULATOR_REPORT
+            elif "Overwriting U-Values for EPRs for ECO4 and GBIS:" in self.text_list[1].lower():
+                self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
+            elif "Energy Performance Report" in self.text_list:
+                self.type = ReportType.ENERGY_PERFORMANCE_REPORT
+            elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in self.text_list:
+                self.type = ReportType.CHARTED_SURVEYOR_REPORT
            else:
                pass
            return self.type
@ -38,5 +47,8 @@ class pdfReaderToText():
    def get_reader(self):
+        """Return the extractor matching this PDF's detected report type.
+
+        Re-runs ``get_file_type()`` and returns a ``QuidosSiteNotesExtractor``
+        for Quidos pre-site notes, a ``CSR`` for chartered surveyor reports,
+        or ``None`` when the type is unrecognised or has no extractor.
+        """
        self.get_file_type()

-        if self.type.name == ReportType.QUIDOS_SITE_NOTE.name:
-            return QuidosSiteNotesExtractor(self.text_list)
+        # self.type stays None for unrecognised PDFs, so compare enum members
+        # directly instead of dereferencing ``.name`` (AttributeError on None).
+        if self.type == ReportType.QUIDOS_PRESITE_NOTE:
+            return QuidosSiteNotesExtractor(self.text_list)
+        if self.type == ReportType.CHARTED_SURVEYOR_REPORT:
+            return CSR(self.text_list)
+        return None
--- a/etl/pdfReader/reportType.py
+++ b/etl/pdfReader/reportType.py
@ -4,4 +4,6 @@ from enum import Enum
 class ReportType(Enum):
+    """Report kinds that pdfReaderToText.get_file_type() can assign to a PDF."""
    QUIDOS_PRESITE_NOTE = 1
+    # NOTE(review): "CHARTED" is presumably a misspelling of "CHARTERED"; the
+    # member is referenced from other modules, so rename everywhere at once if at all.
    CHARTED_SURVEYOR_REPORT = 2
-    ENERGY_PERFORMANCE_REPORT = 3
+    ENERGY_PERFORMANCE_REPORT = 3
+    U_VALUE_CALCULATOR_REPORT = 4
+    OVERWRITING_U_VALUE_DECLARATION_FORM = 5
--- a/etl/pdfReader/sitenotes.py
+++ b/etl/pdfReader/sitenotes.py
@ -22,13 +22,18 @@ class SiteNotesExtractor():
        
    def get_data_between(self, a, b):
+        # Slice raw_data from the first occurrence of `a` up to (not including)
+        # the first occurrence of `b`. NOTE(review): assuming raw_data is a list
+        # or str, .index() raises ValueError when either marker is absent — confirm.
        return self.raw_data[self.raw_data.index(a):self.raw_data.index(b)]
+    
+class CSR(SiteNotesExtractor):
+    """Extractor for Chartered Surveyor Report PDFs.
+
+    Adds no parsing of its own: it delegates construction to
+    SiteNotesExtractor and tags the instance with its report type.
+    """
+    def __init__(self, data_list):
+        super().__init__(data_list)
+        self.type = ReportType.CHARTED_SURVEYOR_REPORT
        


 class QuidosSiteNotesExtractor(SiteNotesExtractor):
    def __init__(self, data_list):
        super().__init__(data_list) 
-        self.type = ReportType.QUIDOS_SITE_NOTE
+        self.type = ReportType.QUIDOS_PRESITE_NOTE
        self.company_information = None
        self.survey_information = None
        self.property_description = None
--- a/etl/scis_invoice.py
+++ b/etl/scis_invoice.py
@ -2,17 +2,47 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller
 from pprint import pformat
 from etl.pdfReader.pdfReaderToText import pdfReaderToText
 from etl.surveyedData.surveryedData import surveyedDataProcessor
+import pandas as pd
+


-def get_type_of_file(path):
-    pass

 def main():
+    """Build survey_data.xlsx from the PDFs downloaded for each address.
+
+    Downloads files through SharePointScraper, wraps each address in a
+    surveyedDataProcessor, and writes one spreadsheet row per address that
+    has a pre-site note, priced by whether a CSR was also found.
+    """
+    data = {
+         "Address": [],
+         "Surveyor's Name": [],
+         "Type of Work": [],
+         "Price": []
+    }
+
    south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION, development=True)
    file_paths = south_coast_scraper.download_file_for_each_address()
-
+    
+    list_of_surveys = []
    for eachAddress in file_paths:
-            survey = surveyedDataProcessor(eachAddress)
+            for address, files in eachAddress.items():
+                list_of_surveys.append(surveyedDataProcessor(address, files))
+                # NOTE(review): only the first (address, files) pair of each
+                # mapping is processed — confirm this is intended and not
+                # leftover debugging.
+                break
+
+
+    for survey in list_of_surveys:
+        if survey.pre_site_note:
+            if survey.csr:
+                data["Price"].append(500)
+                data["Type of Work"].append("CAVITY ONLY")
+            else:
+                data["Price"].append(1000)
+                data["Type of Work"].append("REMIDIAL CWI ONLY")
+
+            data["Address"].append(survey.address)
+            # NOTE(review): surveyedDataProcessor sets address, files,
+            # pre_site_note and csr but no `pre_site`; this lookup looks like
+            # it raises AttributeError — confirm where the surveyor's name
+            # should come from.
+            data["Surveyor's Name"].append(survey.pre_site)
+
+    df = pd.DataFrame(data)
+
+    # Save to an Excel file
+    df.to_excel("survey_data.xlsx", index=False)
+
+    print("Excel file 'survey_data.xlsx' created successfully!")


 if __name__ == "__main__":
--- a/etl/scraper/scraper.py
+++ b/etl/scraper/scraper.py
@ -58,6 +58,9 @@ class SharePointScraper():
            self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
            self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}

+            self.surveyor_names = ['Carl Fitzgerald']
+            self.surveyor_to_housing_assosications = {"Carl Fitzgerald":['ACIS']}
+            self.surveyor_to_dates_folder = {'Carl Fitzgerald': ['W.C. 03.03.2025']}



--- a/etl/surveyedData/surveryedData.py
+++ b/etl/surveyedData/surveryedData.py
@ -1,6 +1,23 @@
+from etl.pdfReader.pdfReaderToText import pdfReaderToText
+from etl.pdfReader.reportType import ReportType
+
 class surveyedDataProcessor():
-    def __init__(self, address_to_files):
-        for key, value in address_to_files.items():
-            self.address = key
-            self.files = value
-            print(f"Address is {self.address}, with all files at location {self.files}")
+    """Group the downloaded PDFs for one address by report type.
+
+    ``address`` and ``files`` are stored as passed in. ``pre_site_note`` and
+    ``csr`` hold the reader returned by ``pdfReaderToText.get_reader()`` for
+    the matching PDF, or ``None`` when no such PDF is among ``files``.
+    """
+
+    def __init__(self, address, files):
+        self.address = address
+        self.files = files
+        self.pre_site_note = None
+        self.csr = None
+        self.identify_files()
+
+    def identify_files(self):
+        """Classify every file and keep a reader for the recognised report types.
+
+        When several files share a type, the last one in ``files`` wins.
+        """
+        for file in self.files:
+            pdf = pdfReaderToText(file)
+            # pdfReaderToText classifies the file in its constructor, so
+            # ``type`` can be read directly; unrecognised files are ignored.
+            if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
+                self.pre_site_note = pdf.get_reader()
+            elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
+                self.csr = pdf.get_reader()
--- a/poetry.lock
+++ b/poetry.lock
@ -410,6 +410,18 @@ files = [
 dnspython = ">=2.0.0"
 idna = ">=2.0.0"

+[[package]]
+name = "et-xmlfile"
+version = "2.0.0"
+description = "An implementation of lxml.xmlfile for the standard library"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
+    {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
+]
+
 [[package]]
 name = "executing"
 version = "2.2.0"
@ -697,6 +709,21 @@ files = [
    {file = "numpy-2.2.3.tar.gz", hash = "sha256:dbdc15f0c81611925f382dfa97b3bd0bc2c1ce19d4fe50482cb0ddc12ba30020"},
 ]

+[[package]]
+name = "openpyxl"
+version = "3.1.5"
+description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
+    {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
+]
+
+[package.dependencies]
+et-xmlfile = "*"
+
 [[package]]
 name = "packaging"
 version = "24.2"
@ -1436,4 +1463,4 @@ files = [
 [metadata]
 lock-version = "2.1"
 python-versions = ">=3.12"
-content-hash = "710051703d97e156a540ad08b0815338a4283146f6fca3c0ae89cc4e6dad459a"
+content-hash = "7c7fb2198bf2cb04e0af34fa6769280fda46907a2024b8f4c188847962964631"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -12,6 +12,7 @@ dependencies = [
    "msal (>=1.31.1,<2.0.0)",
    "pandas (>=2.2.3,<3.0.0)",
    "pydantic[email] (>=2.10.6,<3.0.0)",
+    "openpyxl (>=3.1.5,<4.0.0)",
 ]

 [tool.poetry]