diff --git a/etl/main.py b/etl/main.py index d9992ea..a46f282 100644 --- a/etl/main.py +++ b/etl/main.py @@ -11,8 +11,8 @@ DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 S pdfReader = pdfReaderToText(DATA_LOC_1) doc1 = pdfReader.get_reader() -pdfReader = pdfReaderToText(DATA_LOC_2) -doc2 = pdfReader.get_reader() +pdfReader2 = pdfReaderToText(DATA_LOC_2) +doc2 = pdfReader2.get_reader() def main(): diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 9a9a3f6..59c727c 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -30,6 +30,7 @@ class QuidosSiteNotes(SiteNotes): self.get_section_5() self.get_section_6() self.get_section_7() + self.get_section_8() def get_summary_information(self): # Summary Information @@ -237,8 +238,6 @@ class QuidosSiteNotes(SiteNotes): def get_section_7(self): data = self.raw_data[self.raw_data.index('7.0 Walls'): self.raw_data.index('8.0 Roofs')] avoid = [ - "7.0 Walls", - "8.0 Roofs", "Construction", "Insulation", "Insulation Thickness(mm)", @@ -268,17 +267,40 @@ class QuidosSiteNotes(SiteNotes): if item in avoid: setattr(self, f"extensions_{j}_{item.lower().replace(' ', '_').replace('-', '_')}", get_value(item)) + def get_section_8(self): + data = self.raw_data[self.raw_data.index('8.0 Roofs'): self.raw_data.index('9.0 Floors')] + avoid = [ + "Construction", + "Insulation Type", + "Insulation Thickness", + "U-value Known", + ] + + titles = [ + "Main Property", + "Extension 1", + "Extension 2", + "Extension 3", + "Extension 4", + ] + + + title = titles[0].lower().replace(" ", "_").replace("-","_") + proc_data = data + for items in data: + if items in titles: + title = items.lower().replace(" ", "_").replace("-", "_") + index = titles.index(items) + if titles[index] in data: + proc_data = data[data.index(titles[index]):] + continue + else: + break + get_value = lambda key: None if proc_data[proc_data.index(key) + 1] in avoid else proc_data[proc_data.index(key) + 1] + if items in avoid: + setattr(self, f"section_8_{title}_{items.lower().replace(' ', '_').replace('-','_')}", get_value(items)) - - - - - - - def get_section_8(self): - pass - # validatin # function of object of type this