time to find out if it works for all

This commit is contained in:
Jun-te Kim 2025-03-19 14:20:26 +00:00
parent 32fb397ca1
commit 9f9abe2280
8 changed files with 112 additions and 15 deletions

View file

@ -1,7 +1,7 @@
from etl.utils.logger import Logger
import logging
import pymupdf
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR
from etl.pdfReader.reportType import ReportType
class pdfReaderToText():
@ -13,6 +13,7 @@ class pdfReaderToText():
self.text_list = []
self.get_text_from_pdf_file()
self.type = None
self.get_file_type()
def get_text_from_pdf_file(self):
self.logger.debug(f"Extrating text from {self.source_path}")
@ -31,6 +32,14 @@ class pdfReaderToText():
# Classify the document from marker phrases found in the extracted text.
# Phrases compared against lower-cased text are lower-cased themselves:
# a mixed-case phrase can never be found inside lower-cased text.
if len(self.text_list) > 1:
    first_text = self.text_list[0].lower()
    if "Quidos Ltd using Argyle software BRE approved calculator".lower() in first_text:
        self.type = ReportType.QUIDOS_PRESITE_NOTE
    # NOTE(review): the "Į" looks like a ligature artefact of "unfilled",
    # presumably copied from the extracted text; kept byte-for-byte, verify.
    elif "Wall pre - Masonry cavity wall-unĮlled".lower() in first_text:
        self.type = ReportType.U_VALUE_CALCULATOR_REPORT
    elif "Overwriting U-Values for EPRs for ECO4 and GBIS:".lower() in self.text_list[1].lower():
        self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
    # The two phrases below are searched for inside every extracted text.
    # Plain `phrase in self.text_list` only matches when a whole list element
    # equals the phrase exactly, which the substring search still covers.
    elif any("Energy Performance Report" in text for text in self.text_list):
        self.type = ReportType.ENERGY_PERFORMANCE_REPORT
    elif any(
        "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in text
        for text in self.text_list
    ):
        self.type = ReportType.CHARTED_SURVEYOR_REPORT
    # No marker matched: self.type keeps whatever value it already had.
return self.type
@ -38,5 +47,8 @@ class pdfReaderToText():
def get_reader(self):
    """Return the extractor object for the detected report type.

    The file type is re-detected with ``get_file_type()`` first.

    Returns:
        A ``QuidosSiteNotesExtractor`` for Quidos pre-site notes, a ``CSR``
        for chartered surveyor reports, or ``None`` when the type is unknown
        or has no extractor.
    """
    self.get_file_type()
    # self.type stays None when no marker phrase matched; reading `.name`
    # on it would raise AttributeError, so bail out explicitly.
    if self.type is None:
        return None
    # Compare enum members directly. The earlier name-based comparison also
    # referenced ReportType.QUIDOS_SITE_NOTE, which the enum does not define.
    if self.type == ReportType.QUIDOS_PRESITE_NOTE:
        return QuidosSiteNotesExtractor(self.text_list)
    if self.type == ReportType.CHARTED_SURVEYOR_REPORT:
        return CSR(self.text_list)
    return None

View file

@ -4,4 +4,6 @@ from enum import Enum
class ReportType(Enum):
    """Kinds of PDF report that the reader distinguishes.

    Each member name is defined exactly once; ``Enum`` raises ``TypeError``
    when a member name is reused inside the class body.
    """

    QUIDOS_PRESITE_NOTE = 1
    # NOTE(review): presumably meant "CHARTERED"; the spelling is kept because
    # other modules reference this member by name.
    CHARTED_SURVEYOR_REPORT = 2
    ENERGY_PERFORMANCE_REPORT = 3
    U_VALUE_CALCULATOR_REPORT = 4
    OVERWRITING_U_VALUE_DECLARATION_FORM = 5

View file

@ -22,13 +22,18 @@ class SiteNotesExtractor():
def get_data_between(self, a, b):
    """Return the slice of ``self.raw_data`` from ``a`` up to, but excluding, ``b``.

    Both markers are located with ``index``, so the first occurrence of each
    is used and ``ValueError`` propagates when either one is missing.
    """
    source = self.raw_data
    start, stop = source.index(a), source.index(b)
    return source[start:stop]
class CSR(SiteNotesExtractor):
    """Extractor for documents classified as a chartered surveyor report."""

    def __init__(self, data_list):
        """Initialise the base extractor with ``data_list`` and tag the report type."""
        super().__init__(data_list)
        # Assigned after the base initialiser, so this value is the one that sticks.
        self.type = ReportType.CHARTED_SURVEYOR_REPORT
class QuidosSiteNotesExtractor(SiteNotesExtractor):
def __init__(self, data_list):
super().__init__(data_list)
self.type = ReportType.QUIDOS_SITE_NOTE
self.type = ReportType.QUIDOS_PRESITE_NOTE
self.company_information = None
self.survey_information = None
self.property_description = None

View file

@ -2,17 +2,47 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller
from pprint import pformat
from etl.pdfReader.pdfReaderToText import pdfReaderToText
from etl.surveyedData.surveryedData import surveyedDataProcessor
import pandas as pd
def get_type_of_file(path):
    """Placeholder: not implemented yet, ignores ``path`` and returns ``None``."""
    return None
def main():
# Download survey files per address, classify them, and write a pricing
# spreadsheet ("survey_data.xlsx") with one column per key of `data`.
# Column name -> list of cell values; handed to pd.DataFrame below.
data = {
"Address": [],
"Surveyor's Name": [],
"Type of Work": [],
"Price": []
}
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION, development=True)
file_paths = south_coast_scraper.download_file_for_each_address()
list_of_surveys = []
# Each element of file_paths is used as a mapping (see .items() below).
for eachAddress in file_paths:
# NOTE(review): this one-argument call does not match the two-argument
# surveyedDataProcessor(address, files) call two lines below and its result
# is overwritten by the later `for survey in ...` loop; looks like a leftover, confirm.
survey = surveyedDataProcessor(eachAddress)
for address, files in eachAddress.items():
list_of_surveys.append(surveyedDataProcessor(address, files))
# NOTE(review): indentation is not visible here, so which loop this `break`
# leaves cannot be determined; confirm that stopping early is intended.
break
for survey in list_of_surveys:
# Surveys without a pre-site note get no price / type-of-work entry.
if survey.pre_site_note:
# Pre-site note plus a CSR is priced at 500, pre-site note alone at 1000.
if survey.csr:
data["Price"].append(500)
data["Type of Work"].append("CAVITY ONLY")
else:
data["Price"].append(1000)
# NOTE(review): "REMIDIAL" is presumably "REMEDIAL"; it is written to the
# spreadsheet, so the runtime value is left untouched here.
data["Type of Work"].append("REMIDIAL CWI ONLY")
# NOTE(review): if these two appends run for surveys that skipped the branch
# above, the column lists end up with different lengths; verify the nesting.
data["Address"].append(survey.address)
# NOTE(review): the surveyedDataProcessor shown in this change sets
# `pre_site_note`, not `pre_site`; confirm this attribute exists.
data["Surveyor's Name"].append(survey.pre_site)
df = pd.DataFrame(data)
# Save to an Excel file
df.to_excel("survey_data.xlsx", index=False)
print("Excel file 'survey_data.xlsx' created successfully!")
if __name__ == "__main__":

View file

@ -58,6 +58,9 @@ class SharePointScraper():
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
self.surveyor_names = ['Carl Fitzgerald']
self.surveyor_to_housing_assosications = {"Carl Fitzgerald":['ACIS']}
self.surveyor_to_dates_folder = {'Carl Fitzgerald': ['W.C. 03.03.2025']}

View file

@ -1,6 +1,23 @@
from etl.pdfReader.pdfReaderToText import pdfReaderToText
from etl.pdfReader.reportType import ReportType
class surveyedDataProcessor():
    """Sort the files belonging to one address into the report readers they match.

    Only the two-argument constructor is kept; a second, earlier ``__init__``
    taking a single mapping was dead code because the later definition
    replaced it.
    """

    def __init__(self, address, files):
        """Store the address and its files, then classify the files.

        Args:
            address: identifier of the surveyed address.
            files: iterable of file locations handed to ``pdfReaderToText``.
        """
        self.address = address
        self.files = files
        # Reader for the Quidos pre-site note, if one of the files is one.
        self.pre_site_note = None
        # Reader for the chartered surveyor report, if one of the files is one.
        self.csr = None
        self.identify_files()

    def identify_files(self):
        """Open every file and keep a reader for each recognised report type.

        When several files share a type, the last one processed wins.
        """
        for file in self.files:
            pdf = pdfReaderToText(file)
            # NOTE(review): truthiness guard kept from the original; it only
            # matters if pdfReaderToText defines __bool__ or __len__.
            if not pdf:
                continue
            if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
                self.pre_site_note = pdf.get_reader()
            elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
                self.csr = pdf.get_reader()

29
poetry.lock generated
View file

@ -410,6 +410,18 @@ files = [
dnspython = ">=2.0.0"
idna = ">=2.0.0"
[[package]]
name = "et-xmlfile"
version = "2.0.0"
description = "An implementation of lxml.xmlfile for the standard library"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
]
[[package]]
name = "executing"
version = "2.2.0"
@ -697,6 +709,21 @@ files = [
{file = "numpy-2.2.3.tar.gz", hash = "sha256:dbdc15f0c81611925f382dfa97b3bd0bc2c1ce19d4fe50482cb0ddc12ba30020"},
]
[[package]]
name = "openpyxl"
version = "3.1.5"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
optional = false
python-versions = ">=3.8"
groups = ["main"]
files = [
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]]
name = "packaging"
version = "24.2"
@ -1436,4 +1463,4 @@ files = [
[metadata]
lock-version = "2.1"
python-versions = ">=3.12"
content-hash = "710051703d97e156a540ad08b0815338a4283146f6fca3c0ae89cc4e6dad459a"
content-hash = "7c7fb2198bf2cb04e0af34fa6769280fda46907a2024b8f4c188847962964631"

View file

@ -12,6 +12,7 @@ dependencies = [
"msal (>=1.31.1,<2.0.0)",
"pandas (>=2.2.3,<3.0.0)",
"pydantic[email] (>=2.10.6,<3.0.0)",
"openpyxl (>=3.1.5,<4.0.0)",
]
[tool.poetry]