From 2fbc330c7ccc3a67a889f2ca62d17c646c22add2 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Wed, 12 Mar 2025 13:20:19 +0000 Subject: [PATCH] use formula to get each section correctly --- etl/pdfReader/sitenotes.py | 45 +++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 9e424bf..b7cbfc5 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -238,7 +238,7 @@ class QuidosSiteNotes(SiteNotes): def get_section_7(self): data = self.raw_data[self.raw_data.index('7.0 Walls'): self.raw_data.index('8.0 Roofs')] - avoid = [ + sub_titles = [ "Construction", "Insulation", "Insulation Thickness(mm)", @@ -253,24 +253,18 @@ class QuidosSiteNotes(SiteNotes): "Alternative Wall Present", ] - get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] - - # Main property - main_info = data[data.index("Main Property"):data.index("Extension 1")] - for i,item in enumerate(main_info): - if item in avoid: - setattr(self, f"main_property_{item.lower().replace(' ', '_').replace('-', '_')}", get_value(item)) - - for j in range(1, 5): - main_data = data[data.index(f"Extension {j}"):] - get_value = lambda key: None if main_data[main_data.index(key) + 1] in avoid else main_data[main_data.index(key) + 1] - for i,item in enumerate(main_data): - if item in avoid: - setattr(self, f"extensions_{j}_{item.lower().replace(' ', '_').replace('-', '_')}", get_value(item)) + main_titles = [ + "Main Property", + "Extension 1", + "Extension 2", + "Extension 3", + "Extension 4", + ] + self.two_column_with_extension_processor(data, sub_titles, main_titles, 7) def get_section_8(self): data = self.raw_data[self.raw_data.index('8.0 Roofs'): self.raw_data.index('9.0 Floors')] - avoid = [ + sub_titles = [ "Construction", "Insulation Type", "Insulation Thickness", @@ -285,25 +279,26 @@ class QuidosSiteNotes(SiteNotes): "Extension 4", ] - self.two_column_with_extension_processor(data, avoid, titles, 8) + self.two_column_with_extension_processor(data, sub_titles, titles, 8) + + def two_column_with_extension_processor(self, data, sub_titles, main_titles, section): - def two_column_with_extension_processor(self, data, avoid, titles, section): title = None proc_data = data for items in data: - if items in titles: + if items in main_titles: title = items.lower().replace(" ", "_").replace("-", "_") - index = titles.index(items) - if titles[index] in data: - print(titles[index]) - proc_data = data[data.index(titles[index]):] + index = main_titles.index(items) + if main_titles[index] in data: + print(main_titles[index]) + proc_data = data[data.index(main_titles[index]):] continue else: break if title is None: continue - get_value = lambda key: None if proc_data[proc_data.index(key) + 1] in avoid else proc_data[proc_data.index(key) + 1] - if items in avoid: + get_value = lambda key: None if proc_data[proc_data.index(key) + 1] in sub_titles else proc_data[proc_data.index(key) + 1] + if items in sub_titles: setattr(self, f"section_{section}_{title}_{items.lower().replace(' ', '_').replace('-','_')}", get_value(items)) def get_section_9(self):