Reviewer notes (free text ahead of the first "diff --git" header; patch tools skip it):
  * Line breaks, indentation and blank context lines were reconstructed from a
    whitespace-collapsed copy of this patch. TODO confirm against the working
    tree; use `git apply --ignore-whitespace` if context does not match exactly.
  * The last sitenotes.py hunk was amended in review, so the post-image hash on
    that file's `index` line is stale.

diff --git a/etl/main.py b/etl/main.py
index da760fd..a9ac4cb 100644
--- a/etl/main.py
+++ b/etl/main.py
@@ -14,7 +14,7 @@ pdfReader = pdfReaderToText(DATA_LOC_1)
 doc2 = pdfReader.get_reader()
 pdfReader2 = pdfReaderToText(DATA_LOC_2)
 doc1 = pdfReader2.get_reader()
-# vars(doc1)
+vars(doc1)
 
 def main():
     pass
diff --git a/etl/pdfReader/pdfReaderToText.py b/etl/pdfReader/pdfReaderToText.py
index 9db5376..c27b721 100644
--- a/etl/pdfReader/pdfReaderToText.py
+++ b/etl/pdfReader/pdfReaderToText.py
@@ -13,7 +13,6 @@ class pdfReaderToText():
         self.text_list = []
         self.get_text_from_pdf_file()
         self.type = None
-        print("everything from scracth")
 
     def get_text_from_pdf_file(self):
         self.logger.debug(f"Extrating text from {self.source_path}")
diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py
index 8e21f05..5973792 100644
--- a/etl/pdfReader/sitenotes.py
+++ b/etl/pdfReader/sitenotes.py
@@ -47,6 +47,10 @@ class QuidosSiteNotes(SiteNotesExtractor):
         self.get_section_16()
         self.get_section_17()
         self.get_section_18()
+        self.get_section_19()
+        self.get_section_20()
+        self.get_section_21()
+        self.get_section_22()
 
     def get_summary_information(self):
         # Summary Information
@@ -510,17 +514,63 @@ class QuidosSiteNotes(SiteNotesExtractor):
         ]
 
         self.two_columns_processor(data, sub_titles, avoid, 18.0)
-        print("hello seems to khklkjbe")
 
     def get_section_19(self):
+        # Section 19.0: Flue Gas Heat Recovery System
         data = self.get_data_between("19.0 Flue Gas Heat Recovery System","20.0 Photovoltaic Panel")
         sub_titles = [
-
+            "FGHRS Present",
         ]
         avoid = [
-
+            "19.0 Flue Gas Heat Recovery System",
+            "20.0 Photovoltaic Panel",
         ]
 
+        # Float section number, consistent with the 18.0 passed for section 18.
+        self.two_columns_processor(data, sub_titles, avoid, 19.0)
+
+    def get_section_20(self):
+        # Section 20.0: Photovoltaic Panel
+        data = self.get_data_between("20.0 Photovoltaic Panel","21.0 Wind Turbine")
+        sub_titles = [
+            # Keep the commas: adjacent string literals would silently concatenate.
+            "PVs are connected to dwelling electricity",
+            "Percentage of External Roof Area with PVs",
+        ]
+        avoid = [
+            "20.0 Photovoltaic Panel",
+            "21.0 Wind Turbine",
+        ]
+        self.two_columns_processor(data, sub_titles, avoid, 20.0)
+
+    def get_section_21(self):
+        # Section 21.0: Wind Turbine
+        data = self.get_data_between("21.0 Wind Turbine","22.0 Other Details")
+        sub_titles = [
+            "Wind Turbine",
+        ]
+        avoid = [
+            "21.0 Wind Turbine",
+            "22.0 Other Details",
+        ]
+        self.two_columns_processor(data, sub_titles, avoid, 21.0)
+
+    def get_section_22(self):
+        # Section 22.0: Other Details
+        data = self.get_data_between("22.0 Other Details","Recommendations (Carbon Saving Figures Are For Guidance Only)")
+        sub_titles = [
+            "Electricity Meter Type",
+            "Mains Gas Available",
+        ]
+        avoid = [
+            "22.0 Other Details",
+            "Recommendations (Carbon Saving Figures Are For Guidance Only)",
+        ]
+
+        self.two_columns_processor(data, sub_titles, avoid, 22.0)
+
+
+# TODO: Section 20 and 11, check results for 18 to 22
 # Extract
 # Transform ( wiht validation pydantnic)
 # Load
\ No newline at end of file