From bbaddbefa294a7b70ec5c6ebd26585c668a02b15 Mon Sep 17 00:00:00 2001 From: Jun-te Kim Date: Thu, 13 Mar 2025 10:01:59 +0000 Subject: [PATCH] windows scraping is working --- etl/development.py | 4 +- etl/pdfReader/sitenotes.py | 87 ++++++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/etl/development.py b/etl/development.py index f06c74f..1612ee0 100644 --- a/etl/development.py +++ b/etl/development.py @@ -17,8 +17,8 @@ doc1 = pdfReader2.get_reader() vars(doc1) def main(): - south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE, development = True) - south_coast_scraper.download_file_for_each_address() + # south_coast_scraper = SharePointScraper(SharePointInstaller.SOUTH_COAST_INSULATION_SERVICE, development = True) + # south_coast_scraper.download_file_for_each_address() if __name__ == "__main__": main() diff --git a/etl/pdfReader/sitenotes.py b/etl/pdfReader/sitenotes.py index 5973792..d961a2c 100644 --- a/etl/pdfReader/sitenotes.py +++ b/etl/pdfReader/sitenotes.py @@ -36,7 +36,7 @@ class QuidosSiteNotes(SiteNotesExtractor): self.get_section_8() self.get_section_9() self.get_section_10() - # self.get_section_11() + self.get_section_11() self.get_section_12() self.get_section_13() self.get_section_14() @@ -359,11 +359,11 @@ class QuidosSiteNotes(SiteNotesExtractor): ] self.two_columns_processor(data, sub_titles, avoid, 10) - def two_columns_processor(self, data, sub_titles_to_gather, avoid, section): + def two_columns_processor(self, data, sub_titles_to_gather, avoid, section, indexAdd = 1): def get_value(key): try: index = data.index(key) - value = data[index + 1] + value = data[index + indexAdd] return None if value in avoid else value except (ValueError, IndexError): return None @@ -375,7 +375,66 @@ class QuidosSiteNotes(SiteNotesExtractor): setattr(self, f"section_{section}_{items.lower().replace('-', '_').replace(' ','_')}", get_value(items)) def get_section_11(self): - raise RuntimeError("Please complete me") + data = self.get_data_between("Window Location", "12.0 Ventilation & Cooling") + headers = data[:8] + data_entries = data[8:] + + num_attributes = 5 + subtitles=[ + "Main Property", + "Extension 1", + "Extension 2", + "Extension 3", + "Extension 4", + ] + + orientation = [ + "north", + "east", + "west", + "south", + "n", + "w", + "s", + "e", + "nw", + "ne", + "sw", + "se", + "south west", + "south east", + "north west", + "north east", + + ] + + def find_compose_index(lst, compose): + for i, item in enumerate(lst): + if item.lower() in compose: + return i + return None + + title = None + until = 0 + for i, items in enumerate(data_entries): + if data_entries[i] in subtitles: + title = data_entries[i].lower().replace(" ", "_").replace("-", "_") + setattr(self, f"section_11_{title}_window", []) + if title and until == i: + entry = data_entries[i:] + index = find_compose_index(entry,orientation) + new_entry = entry[index-3:index+3] + dict_ = { + "glazing type": new_entry[0], + "Area (m2)": new_entry[1], + "Roof Window": new_entry[2], + "Orientation": new_entry[3], + "U-value (W/m²K)": new_entry[4], + "g-value": new_entry[5], + } + lst = getattr(self, f"section_11_{title}_window") + lst.append(dict_) + until = index + 3 + i def get_section_12(self): data = self.raw_data[self.raw_data.index('12.0 Ventilation & Cooling'): self.raw_data.index('13.0 Lighting')] @@ -505,8 +564,6 @@ class QuidosSiteNotes(SiteNotesExtractor): data = self.get_data_between("18.0 Showers And Baths", "19.0 Flue Gas Heat Recovery System") sub_titles = [ "Number of Rooms with Bath and/or Shower", - "Number of Rooms with Mixer Shower and no Bath", - "Number of Rooms with Mixer Shower and Bath", ] avoid = [ "18.0 Showers And Baths", @@ -514,6 +571,15 @@ class QuidosSiteNotes(SiteNotesExtractor): ] self.two_columns_processor(data, sub_titles, avoid, 18.0) + avoid = [ + "18.0 Showers And Baths", + "19.0 Flue Gas Heat Recovery System", + ] + sub_titles = [ + "Number of Rooms with Mixer Shower and no", # Number of Rooms with Mixer Shower and no Bath + "Number of Rooms with Mixer Shower and", # Number of Rooms with Mixer Shower and Bath + ] + self.two_columns_processor(data, sub_titles, avoid, 18.0, 2) def get_section_19(self): data = self.get_data_between("19.0 Flue Gas Heat Recovery System","20.0 Photovoltaic Panel") @@ -529,9 +595,7 @@ class QuidosSiteNotes(SiteNotesExtractor): def get_section_20(self): data = self.get_data_between("20.0 Photovoltaic Panel","21.0 Wind Turbine") - print(data) sub_titles = [ - "PVs are connected to dwelling electricity" "Percentage of External Roof Area with PVs" ] avoid = [ @@ -539,6 +603,11 @@ class QuidosSiteNotes(SiteNotesExtractor): "21.0 Wind Turbine", ] self.two_columns_processor(data, sub_titles, avoid, 20) + + sub_titles = [ + "PVs are connected to dwelling electricity" # PVs are connected to dwelling electricity meter + ] + self.two_columns_processor(data, sub_titles, avoid, 20, 2) def get_section_21(self): data = self.get_data_between("21.0 Wind Turbine","22.0 Other Details") @@ -565,7 +634,7 @@ class QuidosSiteNotes(SiteNotesExtractor): self.two_columns_processor(data, sub_titles, avoid, 22) -# Section 20 and 11, check results for 18 to 22 +# Section and 11 # Extract # Transform ( wiht validation pydantnic) # Load \ No newline at end of file