mirror of
https://github.com/Hestia-Homes/survey-extraction.git
synced 2026-06-30 13:10:56 +00:00
time to find out if it works for all
This commit is contained in:
parent
32fb397ca1
commit
9f9abe2280
8 changed files with 112 additions and 15 deletions
|
|
@ -1,7 +1,7 @@
|
|||
from etl.utils.logger import Logger
|
||||
import logging
|
||||
import pymupdf
|
||||
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor
|
||||
from etl.pdfReader.sitenotes import QuidosSiteNotesExtractor, CSR
|
||||
from etl.pdfReader.reportType import ReportType
|
||||
|
||||
class pdfReaderToText():
|
||||
|
|
@ -13,6 +13,7 @@ class pdfReaderToText():
|
|||
self.text_list = []
|
||||
self.get_text_from_pdf_file()
|
||||
self.type = None
|
||||
self.get_file_type()
|
||||
|
||||
def get_text_from_pdf_file(self):
|
||||
self.logger.debug(f"Extrating text from {self.source_path}")
|
||||
|
|
@ -31,6 +32,14 @@ class pdfReaderToText():
|
|||
if len(self.text_list) > 1:
|
||||
if "Quidos Ltd using Argyle software BRE approved calculator".lower() in self.text_list[0].lower():
|
||||
self.type = ReportType.QUIDOS_PRESITE_NOTE
|
||||
elif "Wall pre - Masonry cavity wall-unĮlled".lower() in self.text_list[0].lower():
|
||||
self.type = ReportType.U_VALUE_CALCULATOR_REPORT
|
||||
elif "Overwriting U-Values for EPRs for ECO4 and GBIS:" in self.text_list[1].lower():
|
||||
self.type = ReportType.OVERWRITING_U_VALUE_DECLARATION_FORM
|
||||
elif "Energy Performance Report" in self.text_list:
|
||||
self.type = ReportType.ENERGY_PERFORMANCE_REPORT
|
||||
elif "Chartered Surveyor Report: Recommending Extraction of Defective Cavity Wall Insulation " in self.text_list:
|
||||
self.type = ReportType.CHARTED_SURVEYOR_REPORT
|
||||
else:
|
||||
pass
|
||||
return self.type
|
||||
|
|
@ -38,5 +47,8 @@ class pdfReaderToText():
|
|||
def get_reader(self):
    """Return the extractor matching this document's detected report type.

    The report type is refreshed through ``get_file_type()`` before the
    extractor is chosen.

    Returns:
        ``QuidosSiteNotesExtractor`` for a Quidos pre-site note, ``CSR`` for a
        chartered surveyor report, and ``None`` when the type was not
        recognised or no extractor exists for it.
    """
    self.get_file_type()

    # self.type starts out as None and stays None when no marker text is
    # matched; return early instead of failing on an attribute lookup on None.
    if self.type is None:
        return None

    # Compare enum members directly (one consistent style for every branch).
    if self.type == ReportType.QUIDOS_PRESITE_NOTE:
        return QuidosSiteNotesExtractor(self.text_list)
    if self.type == ReportType.CHARTED_SURVEYOR_REPORT:
        return CSR(self.text_list)

    # Recognised type without a dedicated extractor.
    return None
|
||||
|
||||
|
|
@ -4,4 +4,6 @@ from enum import Enum
|
|||
class ReportType(Enum):
    """Kinds of survey document that the PDF reader can tag a file as.

    Each member name must appear exactly once: reusing a name inside an
    ``Enum`` body raises ``TypeError`` when the class is created.
    """

    QUIDOS_PRESITE_NOTE = 1
    # NOTE: "CHARTED" spelling is kept because other modules reference this name.
    CHARTED_SURVEYOR_REPORT = 2
    ENERGY_PERFORMANCE_REPORT = 3
    U_VALUE_CALCULATOR_REPORT = 4
    OVERWRITING_U_VALUE_DECLARATION_FORM = 5
|
||||
|
|
@ -22,13 +22,18 @@ class SiteNotesExtractor():
|
|||
|
||||
def get_data_between(self, a, b):
    """Return the part of ``self.raw_data`` lying between two markers.

    The result starts at the first occurrence of *a* (included) and stops
    just before the first occurrence of *b* (excluded). If *b* is found
    before *a* the slice is empty. A missing marker propagates whatever
    ``raw_data.index`` raises (``ValueError`` for lists and strings).
    """
    start = self.raw_data.index(a)
    stop = self.raw_data.index(b)
    return self.raw_data[start:stop]
|
||||
|
||||
class CSR(SiteNotesExtractor):
    """Extractor used for chartered surveyor report documents."""

    def __init__(self, data_list):
        """Set up the base extractor with *data_list* and tag the report type."""
        super().__init__(data_list)
        self.type = ReportType.CHARTED_SURVEYOR_REPORT
|
||||
|
||||
|
||||
|
||||
class QuidosSiteNotesExtractor(SiteNotesExtractor):
|
||||
def __init__(self, data_list):
|
||||
super().__init__(data_list)
|
||||
self.type = ReportType.QUIDOS_SITE_NOTE
|
||||
self.type = ReportType.QUIDOS_PRESITE_NOTE
|
||||
self.company_information = None
|
||||
self.survey_information = None
|
||||
self.property_description = None
|
||||
|
|
|
|||
|
|
@ -2,17 +2,47 @@ from etl.scraper.scraper import SharePointScraper, SharePointInstaller
|
|||
from pprint import pformat
|
||||
from etl.pdfReader.pdfReaderToText import pdfReaderToText
|
||||
from etl.surveyedData.surveryedData import surveyedDataProcessor
|
||||
import pandas as pd
|
||||
|
||||
|
||||
|
||||
def get_type_of_file(path):
    """Placeholder for path-based file type detection (not implemented).

    *path* is ignored and ``None`` is always returned.
    """
    return None
|
||||
|
||||
def main():
# Entry point: downloads survey files through SharePointScraper, wraps them in
# surveyedDataProcessor objects, and writes an Address / Surveyor's Name /
# Type of Work / Price table to survey_data.xlsx.
# NOTE(review): this listing is a flattened diff view - indentation is lost and
# superseded lines appear next to their replacements, so the nesting of the
# statements below must be confirmed against the repository.
|
||||
data = {
|
||||
"Address": [],
|
||||
"Surveyor's Name": [],
|
||||
"Type of Work": [],
|
||||
"Price": []
|
||||
}
|
||||
|
||||
south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION, development=True)
|
||||
file_paths = south_coast_scraper.download_file_for_each_address()
|
||||
|
||||
|
||||
list_of_surveys = []
|
||||
for eachAddress in file_paths:
|
||||
# NOTE(review): one-argument call, but the surveyedDataProcessor.__init__ that
# takes effect expects (address, files) - presumably the superseded line; confirm.
survey = surveyedDataProcessor(eachAddress)
|
||||
# Each element of file_paths is iterated as a mapping of address -> files.
for address, files in eachAddress.items():
|
||||
list_of_surveys.append(surveyedDataProcessor(address, files))
|
||||
# NOTE(review): ends whichever loop encloses it after its first pass - confirm
# this is intentional and not a debugging leftover.
break
|
||||
|
||||
|
||||
for survey in list_of_surveys:
|
||||
if survey.pre_site_note:
|
||||
# Price and work type depend on whether a chartered surveyor report was found.
if survey.csr:
|
||||
data["Price"].append(500)
|
||||
data["Type of Work"].append("CAVITY ONLY")
|
||||
else:
|
||||
data["Price"].append(1000)
|
||||
data["Type of Work"].append("REMIDIAL CWI ONLY")
|
||||
|
||||
data["Address"].append(survey.address)
|
||||
# NOTE(review): surveyedDataProcessor sets `pre_site_note`, not `pre_site`;
# this attribute access looks like it will raise AttributeError - confirm.
data["Surveyor's Name"].append(survey.pre_site)
|
||||
|
||||
# The four lists must have equal length for the DataFrame constructor.
df = pd.DataFrame(data)
|
||||
|
||||
# Save to an Excel file
|
||||
df.to_excel("survey_data.xlsx", index=False)
|
||||
|
||||
print("Excel file 'survey_data.xlsx' created successfully!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -58,6 +58,9 @@ class SharePointScraper():
|
|||
self.surveyor_to_housing_assosications = {"Abdul Koddus":['Southern Housing']}
|
||||
self.surveyor_to_dates_folder = {'Abdul Koddus': ['W.C. 03.03.2025', 'W.C. 24.02.2025']}
|
||||
|
||||
self.surveyor_names = ['Carl Fitzgerald']
|
||||
self.surveyor_to_housing_assosications = {"Carl Fitzgerald":['ACIS']}
|
||||
self.surveyor_to_dates_folder = {'Carl Fitzgerald': ['W.C. 03.03.2025']}
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,23 @@
|
|||
from etl.pdfReader.pdfReaderToText import pdfReaderToText
|
||||
from etl.pdfReader.reportType import ReportType
|
||||
|
||||
class surveyedDataProcessor():
    """Collect the files of one address and pick out the recognised reports.

    Attributes:
        address: Address the files belong to, stored as given.
        files: Collection of files handed to ``pdfReaderToText`` one by one
            (presumably PDF paths - confirm against the scraper output).
        pre_site_note: Reader for the Quidos pre-site note, ``None`` if no
            such file was found.
        csr: Reader for the chartered surveyor report, ``None`` if no such
            file was found.
    """

    def __init__(self, address, files):
        """Store the inputs and immediately classify every file.

        Args:
            address: Address the files belong to.
            files: Iterable of files for that address; ``None`` is treated
                as "no files".
        """
        self.address = address
        self.files = files
        self.pre_site_note = None
        self.csr = None
        self.identify_files()

    def identify_files(self):
        """Read each file and keep a reader for every recognised report type.

        When several files share a type, the reader of the last one wins.
        Files of any other type are ignored.
        """
        # Tolerate a missing file collection instead of failing on iteration.
        for file in self.files or ():
            pdf = pdfReaderToText(file)
            if not pdf:
                continue
            if pdf.type == ReportType.QUIDOS_PRESITE_NOTE:
                self.pre_site_note = pdf.get_reader()
            elif pdf.type == ReportType.CHARTED_SURVEYOR_REPORT:
                self.csr = pdf.get_reader()
|
||||
|
|
|
|||
29
poetry.lock
generated
29
poetry.lock
generated
|
|
@ -410,6 +410,18 @@ files = [
|
|||
dnspython = ">=2.0.0"
|
||||
idna = ">=2.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "2.0.0"
|
||||
description = "An implementation of lxml.xmlfile for the standard library"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"},
|
||||
{file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "executing"
|
||||
version = "2.2.0"
|
||||
|
|
@ -697,6 +709,21 @@ files = [
|
|||
{file = "numpy-2.2.3.tar.gz", hash = "sha256:dbdc15f0c81611925f382dfa97b3bd0bc2c1ce19d4fe50482cb0ddc12ba30020"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.5"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
groups = ["main"]
|
||||
files = [
|
||||
{file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"},
|
||||
{file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "24.2"
|
||||
|
|
@ -1436,4 +1463,4 @@ files = [
|
|||
[metadata]
|
||||
lock-version = "2.1"
|
||||
python-versions = ">=3.12"
|
||||
content-hash = "710051703d97e156a540ad08b0815338a4283146f6fca3c0ae89cc4e6dad459a"
|
||||
content-hash = "7c7fb2198bf2cb04e0af34fa6769280fda46907a2024b8f4c188847962964631"
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ dependencies = [
|
|||
"msal (>=1.31.1,<2.0.0)",
|
||||
"pandas (>=2.2.3,<3.0.0)",
|
||||
"pydantic[email] (>=2.10.6,<3.0.0)",
|
||||
"openpyxl (>=3.1.5,<4.0.0)",
|
||||
]
|
||||
|
||||
[tool.poetry]
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue