From 5059bc28e9dfbac425dc92883b5fc406fb5fe1cb Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Tue, 11 Mar 2025 12:19:25 +0000 Subject: [PATCH] pdf reader work --- etl/main.py | 18 ++++++------ etl/pdfReader/sitenotes.py | 57 +++++++++++++++++++++++++++++++++++++- 2 files changed, 65 insertions(+), 10 deletions(-) diff --git a/etl/main.py b/etl/main.py index 08c163e..a234ae4 100644 --- a/etl/main.py +++ b/etl/main.py @@ -25,18 +25,18 @@ def main(): # logger.info(pformat(list_of_house_ass_names)) # POC of downloading each file - south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - south_coast_scraper.download_file_for_each_address() + # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + # south_coast_scraper.download_file_for_each_address() # POC of pdf reader - DATA_LOC = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" - pdfReader = pdfReaderToText(DATA_LOC) - siteNoteReader = pdfReader.get_reader() - logger.warning(siteNoteReader.type) + DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" + DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" - - - # logger.info(south_coast_scraper.surveyor_to_housing_assosications) + pdfReader = pdfReaderToText(DATA_LOC_1) + doc1 = pdfReader.get_reader() + pdfReader = pdfReaderToText(DATA_LOC_2) + doc2 = pdfReader.get_reader() + if __name__ == "__main__": diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 260761f..d7354f3 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -8,4 +8,59 @@ class SiteNotes(): class QuidosSiteNotes(SiteNotes): def __init__(self, data_list): super().__init__(data_list) - self.type = ReportType.QUIDOS_SITE_NOTE \ No newline at end of file + self.type = ReportType.QUIDOS_SITE_NOTE + self.setup() + + def setup(self): + """ + A function to read QUIDOS SITE REPORT and get all data + """ + # Summary Information + avoid = [ + "Reference Number", + "EPC Language", + "UPRN", + "Postcode", + "Region", + "Address", + "Town", + "County", + "Property Tenure", + "Transaction Type", + "Inspection Date", + 'Assessor’s accreditation number', + 'Assessor’s name', + 'Company name/trading name', + 'Address', + 'POST CODE', + 'Phone number', + 'Fax number', + 'E-mail address', + 'Related party disclosure', + 'Current SAP rating', + 'Potential SAP rating', + 'Current EI rating', + 'Current annual emissions', + 'Current annual energy costs', + 'Emission figures including 9.92 emission factor of 0.925', + ] + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + + self.reference_number = get_value('Reference Number') + self.epc_language = get_value('EPC Language') + self.UPRN = get_value('UPRN') + self.postcode = get_value('Postcode') + self.region = get_value('Region') + self.address = get_value('Address') + self.town = get_value('Town') + self.country = get_value('County') + self.property_tenure = get_value('Property Tenure') + self.transaction_type = get_value('Transaction Type') + self.inspection_date = get_value('Inspection Date') + self.assessor_accrediation_number + self.company_trading_name + self.company_post_code + self.company_fax_number + self.company_related_party_disclosure + self.assessor_ +