diff --git a/.vscode/settings.json b/.vscode/settings.json index 2ec73ee..e8c08c6 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,11 @@ { "jupyter.interactiveWindow.textEditor.executeSelection": true, "python.REPL.sendToNativeREPL": true + + // Hot reload setting that needs to be in user settings + // "jupyter.runStartupCommands": [ + // "%load_ext autoreload", "%autoreload 2" + // ] + + } \ No newline at end of file diff --git a/etl/main.py b/etl/main.py index c98d63c..da760fd 100644 --- a/etl/main.py +++ b/etl/main.py @@ -5,6 +5,7 @@ from pprint import pprint, pformat import logging from etl.utils.logger import Logger from etl.validator.validator import DomnaSharePointValidator + logger = Logger(name="main.py", level=logging.DEBUG).get_logger() DATA_LOC_1 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/10 Turnberry Close TN38 0WL/PRE SITE NOTES.pdf" DATA_LOC_2 = "/tmp/sharepoint/Abdul Koddus/W.C. 03.03.2025/Southern Housing/16 Sunningdale Drive TN38 0WB/PRE SITE NOTES.pdf" @@ -13,8 +14,7 @@ pdfReader = pdfReaderToText(DATA_LOC_1) doc2 = pdfReader.get_reader() pdfReader2 = pdfReaderToText(DATA_LOC_2) doc1 = pdfReader2.get_reader() -vars(doc1) - +# vars(doc1) def main(): pass @@ -34,8 +34,8 @@ def main(): # logger.info(pformat(list_of_house_ass_names)) # POC of downloading each file - # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) - # south_coast_scraper.download_file_for_each_address() + south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE) + south_coast_scraper.download_file_for_each_address() # POC of pdf reader diff --git a/etl/pdfReader/pdfReaderToText.py b/etl/pdfReader/pdfReaderToText.py index c27b721..9db5376 100644 --- a/etl/pdfReader/pdfReaderToText.py +++ b/etl/pdfReader/pdfReaderToText.py @@ -13,6 +13,7 @@ class pdfReaderToText(): self.text_list = [] self.get_text_from_pdf_file() self.type = None + print("everything from 
scracth") def get_text_from_pdf_file(self): self.logger.debug(f"Extrating text from {self.source_path}") diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 86c25b0..8e21f05 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -10,6 +10,9 @@ class SiteNotesExtractor(): except IndexError: return None # Return None if the value does not occur twice + def get_data_between(self, a, b): + return self.raw_data[self.raw_data.index(a):self.raw_data.index(b)] + class QuidosSiteNotes(SiteNotesExtractor): @@ -42,6 +45,8 @@ class QuidosSiteNotes(SiteNotesExtractor): self.get_section_15_0() self.get_section_15_1() self.get_section_16() + self.get_section_17() + self.get_section_18() def get_summary_information(self): # Summary Information @@ -488,6 +493,34 @@ class QuidosSiteNotes(SiteNotesExtractor): ] self.two_columns_processor(data, sub_titles, avoid, 16.0) + + def get_section_17(self): + pass + + def get_section_18(self): + data = self.get_data_between("18.0 Showers And Baths", "19.0 Flue Gas Heat Recovery System") + sub_titles = [ + "Number of Rooms with Bath and/or Shower", + "Number of Rooms with Mixer Shower and no Bath", + "Number of Rooms with Mixer Shower and Bath", + ] + avoid = [ + "18.0 Showers And Baths", + "19.0 Flue Gas Heat Recovery System", + ] + + self.two_columns_processor(data, sub_titles, avoid, 18.0) + print("hello seems to khklkjbe") + + def get_section_19(self): + data = self.get_data_between("19.0 Flue Gas Heat Recovery System","20.0 Photovoltaic Panel") + sub_titles = [ + + ] + avoid = [ + + ] + # Extract # Transform (with pydantic validation) # Load \ No newline at end of file