diff --git a/etl/main.py b/etl/main.py index c0e9a17..d9992ea 100644 --- a/etl/main.py +++ b/etl/main.py @@ -6,9 +6,17 @@ import logging from etl.utils.logger import Logger from etl.validator.validator import DomnaSharePointValidator logger = Logger(name="main.py", level=logging.DEBUG).get_logger() +DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" +DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" + +pdfReader = pdfReaderToText(DATA_LOC_1) +doc1 = pdfReader.get_reader() +pdfReader = pdfReaderToText(DATA_LOC_2) +doc2 = pdfReader.get_reader() def main(): + pass # POC PDF Reader # list_ = pdfReaderToText(INTERESTING_FILE_LOC).get_list_of_test() # pprint(list_) @@ -29,13 +37,7 @@ def main(): # south_coast_scraper.download_file_for_each_address() # POC of pdf reader - DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" - DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" - - pdfReader = pdfReaderToText(DATA_LOC_1) - doc1 = pdfReader.get_reader() - pdfReader = pdfReaderToText(DATA_LOC_2) - doc2 = pdfReader.get_reader() + if __name__ == "__main__": @@ -50,7 +52,6 @@ if __name__ == "__main__": # Work out productivity metirc (number of address in submission folder, with at least one file included) - # Khalim would like these metrics from the pdf # address, uprn, assessor's name validation, current sap rating, current annual emissions. 
DImension diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 3d86774..9a9a3f6 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -9,6 +9,7 @@ class SiteNotes(): return [i for i, v in enumerate(lst) if v == value][x] except IndexError: return None # Return None if the value does not occur twice + class QuidosSiteNotes(SiteNotes): @@ -26,6 +27,9 @@ class QuidosSiteNotes(SiteNotes): self.get_section_2() self.get_section_3() self.get_section_4() + self.get_section_5() + self.get_section_6() + self.get_section_7() def get_summary_information(self): # Summary Information @@ -57,7 +61,7 @@ class QuidosSiteNotes(SiteNotes): 'Current annual energy costs', 'Emission figures including 9.92 emission factor of 0.925', ] - get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] self.reference_number = get_value('Reference Number') self.epc_language = get_value('EPC Language') @@ -107,7 +111,8 @@ class QuidosSiteNotes(SiteNotes): "Detachment/Position", "2.0 Number Of" ] - get_value = lambda x: None if data[data.index(x) + 1] in avoid else data[data.index(x) + 1] + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + self.property_type_built_form = get_value("Built Form") self.property_type_detatchment_position = get_value("Detachment/Position") @@ -130,7 +135,8 @@ class QuidosSiteNotes(SiteNotes): "3.0 Date Built", ] - get_value = lambda x: None if data[data.index(x) + 1] in avoid else data[data.index(x) + 1] + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + self.main_property = get_value("Main Property") self.extension_1 = get_value('Extension 1') 
self.extension_2 = get_value('Extension 2') @@ -213,9 +219,69 @@ class QuidosSiteNotes(SiteNotes): setattr(self, f"extension_{i}_dimensions)", create_dimensions_array(f"Extension {i} Property", int(getattr(self, f"extension_{i}")))) else: setattr(self, f"extensions_{i}_dimensions",None) + + def get_section_5(self): + data = self.raw_data[self.raw_data.index('5.0 Conservatory'):self.raw_data.index('7.0 Walls')] + avoid = [ + 'Is there a conservatory?', + '7.0 Walls' + ] + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + + self.conservatory = True if get_value("Is there a conservatory?") == "YES" else False + + + def get_section_6(self): + pass + + def get_section_7(self): + data = self.raw_data[self.raw_data.index('7.0 Walls'): self.raw_data.index('8.0 Roofs')] + avoid = [ + "7.0 Walls", + "8.0 Roofs", + "Construction", + "Insulation", + "Insulation Thickness(mm)", + "Wall Thickness Measured?", + "Wall Thickness Measured", + "Wall Thickness(mm)", + "U-value Known?", + "U-value Known", + "U-value (W/m²K)", + "Dry-lining?", + "Alternative Wall Present?", + "Alternative Wall Present", + ] + + get_value = lambda key: None if self.raw_data[self.raw_data.index(key) + 1] in avoid else self.raw_data[self.raw_data.index(key) + 1] + + # Main property + main_info = data[data.index("Main Property"):data.index("Extension 1")] + for i,item in enumerate(main_info): + if item in avoid: + setattr(self, f"main_property_{item.lower().replace(' ', '_').replace('-', '_')}", get_value(item)) + + for j in range(1, 5): + main_data = data[data.index(f"Extension {j}"):] + get_value = lambda key: None if main_data[main_data.index(key) + 1] in avoid else main_data[main_data.index(key) + 1] + for i,item in enumerate(main_data): + if item in avoid: + setattr(self, f"extensions_{j}_{item.lower().replace(' ', '_').replace('-', '_')}", get_value(item)) + + + + + + + + + + def get_section_8(self): + pass - +# 
validation +# function of object of type this