From 7aa694a5ff151921993927cc3d7fcb17bb1ca597 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 12 Mar 2025 12:54:48 +0000 Subject: [PATCH] made into a function for two column things --- etl/main.py | 4 ++-- etl/pdfReader/sitenotes.py | 35 +++++++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/etl/main.py b/etl/main.py index a46f282..7d70ed7 100644 --- a/etl/main.py +++ b/etl/main.py @@ -10,9 +10,9 @@ DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 T DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" pdfReader = pdfReaderToText(DATA_LOC_1) -doc1 = pdfReader.get_reader() +doc2 = pdfReader.get_reader() pdfReader2 = pdfReaderToText(DATA_LOC_2) -doc2 = pdfReader2.get_reader() +doc1 = pdfReader2.get_reader() def main(): diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 59c727c..9e424bf 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -31,6 +31,7 @@ class QuidosSiteNotes(SiteNotes): self.get_section_6() self.get_section_7() self.get_section_8() + self.get_section_9() def get_summary_information(self): # Summary Information @@ -283,22 +284,48 @@ class QuidosSiteNotes(SiteNotes): "Extension 3", "Extension 4", ] - - - title = titles[0].lower().replace(" ", "_").replace("-","_") + + self.two_column_with_extension_processor(data, avoid, titles, 8) + + def two_column_with_extension_processor(self, data, avoid, titles, section): + title = None proc_data = data for items in data: if items in titles: title = items.lower().replace(" ", "_").replace("-", "_") index = titles.index(items) if titles[index] in data: + print(titles[index]) proc_data = data[data.index(titles[index]):] continue else: break + if title is None: + continue get_value = lambda key: None if proc_data[proc_data.index(key) + 1] in avoid else proc_data[proc_data.index(key) + 1] if items in avoid: - setattr(self, f"section_8_{title}_{items.lower().replace(' ', '_').replace('-','_')}", get_value(items)) + setattr(self, f"section_{section}_{title}_{items.lower().replace(' ', '_').replace('-','_')}", get_value(items)) + + def get_section_9(self): + data = self.raw_data[self.raw_data.index('9.0 Floors'): self.raw_data.index('10.0 Doors')] + avoid = [ + "Floor Type", + "Ground Floor Construction", + "Ground Floor Insulation Type", + "Floor Insulation Thickness (mm)", + "U-value Known", + ] + + titles = [ + "Main Property", + "Extension 1", + "Extension 2", + "Extension 3", + "Extension 4", + ] + + + self.two_column_with_extension_processor(data, avoid, titles, 9)