From fff8f50f69cad56ffe353bdf2ab0aa6f2d12573e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 10:16:06 +0000 Subject: [PATCH 01/23] wave 3 applications closed --- etl/customers/cottons/parse_pdf_asset_list.py | 64 +++++++++++++++++++ etl/customers/cottons/prep_asset_list.py | 15 +++++ etl/customers/gla/hug_postcodes.py | 46 +++++++++++++ etl/customers/ksquared/Wave3 Modelling.py | 2 +- .../stonewater/Wave 3 Preparation.py | 16 +++++ .../stonewater/potential_eco_properties.py | 38 +++++++++++ etl/route_march_data_pull/app.py | 21 +++--- 7 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 etl/customers/cottons/parse_pdf_asset_list.py create mode 100644 etl/customers/cottons/prep_asset_list.py create mode 100644 etl/customers/gla/hug_postcodes.py diff --git a/etl/customers/cottons/parse_pdf_asset_list.py b/etl/customers/cottons/parse_pdf_asset_list.py new file mode 100644 index 00000000..7d442e97 --- /dev/null +++ b/etl/customers/cottons/parse_pdf_asset_list.py @@ -0,0 +1,64 @@ +import re +import pandas as pd +from PyPDF2 import PdfReader + +# Paths to the uploaded files +file_paths = [ + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf" +] + + +# Function to extract text from PDFs +def extract_text_from_pdf_with_pypdf2(file_path): + text = "" + reader = PdfReader(file_path) + for page in reader.pages: + text += page.extract_text() + return text + + +# Initialize a list to hold all parsed data +all_parsed_data = [] + +# Process each PDF individually +for i, path in enumerate(file_paths): 
+ # Extract text from the PDF + extracted_text = extract_text_from_pdf_with_pypdf2(path) + + # Step 1: Remove titles and repeated headers + cleaned_text = re.sub(r"Managed Property Report as at \d+ \w+ \d+", "", extracted_text) + cleaned_text = re.sub(r"Code Property Address Management Type", "", cleaned_text) + + # Step 2: Extract rows ending with "Managed" + rows = re.findall(r".*?Managed", cleaned_text) + + # Step 3: Parse rows into structured data + parsed_data = [] + for row in rows: + match = re.match(r"(\S+)\s+(.+?)\s+Managed", row.strip()) + if match: + code = match.group(1).strip() + address = match.group(2).strip() + parsed_data.append((code, address, "Managed")) + + # Append parsed data to the global list + all_parsed_data.extend(parsed_data) + + # Provide feedback for debugging + print(f"File {i + 1} processed: {len(parsed_data)} rows") + +# Step 4: Create a unified DataFrame +final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"]) + +# Step 5: Save the unified DataFrame to an Excel file +final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx" +final_df.to_excel(final_output_file_path, index=False) + +# Provide feedback +print(f"All files processed and combined. Total rows: {len(final_df)}") +print(f"Unified file saved to: {final_output_file_path}") diff --git a/etl/customers/cottons/prep_asset_list.py b/etl/customers/cottons/prep_asset_list.py new file mode 100644 index 00000000..db7c6583 --- /dev/null +++ b/etl/customers/cottons/prep_asset_list.py @@ -0,0 +1,15 @@ +import pandas as pd + +df = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx" +) + +# split up the address on commas. 
First section is address1, last seciton is postcode +df["address1"] = df["Property Address"].apply(lambda x: x.split(",")[0].strip()) +df["postcode"] = df["Property Address"].apply(lambda x: x.split(",")[-1].strip()) + +# Re-save +df.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx", + index=False, +) diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py new file mode 100644 index 00000000..85783d62 --- /dev/null +++ b/etl/customers/gla/hug_postcodes.py @@ -0,0 +1,46 @@ +import inspect +import pandas as pd +from pathlib import Path +from tqdm import tqdm +from etl.epc.settings import EARLIEST_EPC_DATE + +src_file_path = inspect.getfile(lambda: None) + +EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates") +epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] + +aggregation = [] +for directory in tqdm(epc_directories): + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + # Rename the columns to the same format as the api returns + data.columns = [c.replace("_", "-").lower() for c in data.columns] + + data = data[data["posttown"].str.contains("London", case=False, na=False)] + if data.empty: + continue + # Take just date before the date threshold + data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] + + data = data[~pd.isnull(data["uprn"])] + # Take just the newest EPC per uprn, based on lodgement-date + data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") + # Take EPC D and below + data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])] + data["postal_region"] = data["postcode"].str.split(" ").str[0] + + # Take homes that don't have a gas boiler + off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)] + + region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") + + aggregation.append(region_summary) + 
+postal_region_aggregation = pd.concat(aggregation) +postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False) +postal_region_aggregation = postal_region_aggregation.rename( + columns={"postal_region": "Postcode Region", "count": "Number of Homes"} +) +postal_region_aggregation.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx", + index=False +) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 7bfa33b3..0bf6eb18 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -305,7 +305,7 @@ def caha(): # Get conservation area data uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] - conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev") + conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev") addresses = pd.DataFrame(asset_list) addresses["uprn"] = addresses["uprn"].astype(str) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b6c29863..77200e69 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2591,5 +2591,21 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False ) + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + indivual_units = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv") + ) + + u_aids = survey_results["Archetype ID"].astype(str).unique() + units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values + + len({v for v in units_in_bid if str(v) in u_aids}) + 
len(list(set(units_in_bid))) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index 4fb89113..c0301e9a 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -375,3 +375,41 @@ def app(): "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", index=False ) + + +def cross_reference_epc_programme(): + eco3_fallout = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE " + "SURVEYED - ECO3 NOT COMPLETED.xlsx" + ) + + eco3_fallout["house_number"] = eco3_fallout.apply( + lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 + ) + + # for _, x in eco3_fallout.ite + + stonewater_modelled_above_c = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + + stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply( + lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1 + ) + + eco3_fallout_matched_to_above_c = [] + for _, property in eco3_fallout.iterrows(): + # Match on house number + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["house_number"] == property["house_number"] + ] + + # We do a fuzzy match on the address, with levenstein distance + + from fuzzywuzzy import fuzz + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) + ] + match.head() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 6f9dd135..b53b36c2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -120,17 +120,17 @@ def app(): Property UPRN """ - DATA_FOLDER = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" - DATA_FILENAME = "Bromford programme review.xlsx" - SHEET_NAME = "Bromford" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "No." - ADDRESS1_METHOD = "first_two_words" - ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/" + DATA_FILENAME = "Cottons Asset List.xlsx" + SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "postcode" + FULLADDRESS_COLUMN = "Property Address" + ADDRESS1_COLUMN = "address1" + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = [] asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] + # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces @@ -202,7 +202,8 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # Drop the column that is "" - transformed_df = transformed_df.drop(columns=[""]) + if "" in transformed_df.columns: + transformed_df = transformed_df.drop(columns=[""]) # Get the find my epc data find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( From 965cf975e289b11bd1387a55c251e1c50e0327e0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 13:08:03 +0000 Subject: [PATCH 02/23] setting up data extraction pilot --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/lodgement/app.py | 47 +++++ etl/lodgement/requirements.txt | 8 + utils/file_data_extraction.py | 343 +++++++++++++++++++++++++++++++++ 5 files changed, 400 insertions(+), 2 deletions(-) create mode 100644 etl/lodgement/app.py create mode 100644 etl/lodgement/requirements.txt create mode 100644 utils/file_data_extraction.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 762580d9..9b63b142 100644 
--- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index c916a158..acd935c1 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py new file mode 100644 index 00000000..ede644b8 --- /dev/null +++ b/etl/lodgement/app.py @@ -0,0 +1,47 @@ +import os +import utils.file_data_extraction as file_extraction_tools + + +def handler(): + """ + This is a simple application that will extract the data from documents that have been uploaded to Sharepoint + to populate the lodgement spreadsheet with + :return: + """ + + # Ths source data will eventually come from Sharepoint + source_data_path = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot" + output_template = "Trustmark Details - Template REV.25.11.24.xlsx" + + # List the folders in the source data path + folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] + + extractors = { + "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, + "elmhurst summary report": None, + "osmosis condition report": None, + "elmhurst evidence report": None + } + + for property_folder in folders: + coordinator_folder = os.path.join(source_data_path, property_folder, "2. 
RA Coordinator Info") + + # Get the contents of the folder + coordinator_folder_contents = [ + file for file in os.listdir(coordinator_folder) if os.path.isfile(os.path.join(coordinator_folder, file)) + ] + + # We detect the various file types + extracted_contents = {} + for filename in coordinator_folder_contents: + filepath = os.path.join(coordinator_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type is None: + raise ValueError(f"Unknown report type for {filename}") + + file_extractor = extractors.get(report_type) + if file_extractor is None: + continue + + extracted_contents[report_type] = file_extractor(filepath).extract() diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt new file mode 100644 index 00000000..601907ed --- /dev/null +++ b/etl/lodgement/requirements.txt @@ -0,0 +1,8 @@ +PyPDF2 +pandas +tqdm +openpyxl +boto3 +usaddress==0.5.11 +fuzzywuzzy==0.18.0 +python-dotenv \ No newline at end of file diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py new file mode 100644 index 00000000..cdd25f8a --- /dev/null +++ b/utils/file_data_extraction.py @@ -0,0 +1,343 @@ +import PyPDF2 +import re +from collections import Counter + +""" +This script contains functions used to extract data from retrofit survey files, including EPRs, +summary reports, etc +""" + + +def is_elmhurst_energy_report(text): + """ + Determines if the provided text indicates that the PDF is an Energy Report. + Returns True if the text contains 'Energy Report'. + """ + return text.startswith("ENERGY REPORT") + + +def is_elmhurst_summary_report(text): + """ + Determines if the provided text indicates that the PDF is a Summary Report. + """ + return text.startswith("Summary Information") + + +def is_osmosis_condition_report(text): + """ + Determines if the provided text indicates that the PDF is a Condition Report. 
+ """ + return text.startswith("OsmosisACDNEWPAS2035ConditionReport") or text.startswith("OsmosisACDPAS2035ConditionReport") + + +def is_elmhurst_evidence_report(text): + """ + Determines if the provided text indicates that the PDF is an Elmhurst Evidence Report. + """ + return text.startswith("RdSAP Evidence Report") + + +def detect_pdf_report_type(pdf_path): + """ + Detects the type of report based on content or filename. + :param pdf_path: String path to the PDF file + :param pdf_file: String name of the PDF file + :return: String type of the report ("epr", "summary", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" + + if is_elmhurst_energy_report(first_page_text): + return "elmhurst epr" + elif is_elmhurst_summary_report(first_page_text): + return "elmhurst summary report" + elif is_osmosis_condition_report(first_page_text): + return "osmosis condition report" + elif is_elmhurst_evidence_report(first_page_text): + return "elmhurst evidence report" + + return None + + +def is_pdf(filename): + """ + Determines if the provided filename is a PDF file. + """ + return filename.endswith(".pdf") + + +class ElmhurstEprExtractor: + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. 
+ """ + windows_text = windows_text.replace("\n", "") + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the provided text. 
+ """ + data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " + r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + floor_data = match.group(2) + room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, + "Perimeter (m)": None, + "Party Wall Length (m)": None + }) + else: + cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() + + floor_pattern = re.compile( + r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + return data + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof details for each building part in the provided text. 
+ """ + roof_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall details for each building part in the provided text. 
+ """ + wall_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details) + wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details) + wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) + wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details) + + wall_data.append({ + "Building Part": cleaned_part_name, + "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None, + "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None, + "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None, + "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None, + }) + + return wall_data + + @staticmethod + def extract_primary_heating(text): + + # Extract Primary Heating Section (Main Heating 1) + primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) + # We may not have a secondary heating + primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + primary_text = primary_heating_section.group(1) + + primary_heating_output = { + "Existing Primary Heating System": re.search( + r"Main Heating Code\s*(.*?)\n", primary_text + ).group(1).strip(), + "Existing Primary Heating PCDF Reference": re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1), + "Existing Primary Heating 
Controls": re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip(), + "Existing Primary Heating % of Heat": int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) + ) + } + + return primary_heating_output + + @staticmethod + def extract_secondary_heating(text): + # Extract Secondary Heating Section (Main Heating 2) + secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + output = {} + if secondary_heating_section is None: + output["Existing Heating System"] = "" + output["Existing Heating PCDF Reference"] = "" + output["Existing Heating Controls"] = "" + output["Existing Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip() + + output["Existing Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", secondary_text + ).group(1) + + if output["Existing Heating System"] == "": + output["Existing Heating Controls"] = "" + else: + # Might not have heating controls on 2nd system + secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + output["Existing Heating Controls"] = ( + secondary_controls_match.group(1).strip() if secondary_controls_match else "" + ) + output["Existing Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) + ) + + return output + + def extract(self): + data = {} + + with open(self.file_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "".join(page.extract_text() for page in reader.pages) + + # Extracting individual components + address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + data["Address"] = address_match.group(1).strip() + 
data["Postcode"] = data["Address"].split(",")[-1].strip() + + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + data["Current SAP Rating"] = int(sap_match.group(1)) + + energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) + + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + data["Number of Storeys"] = int(storeys_match.group(1)) + + fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + data["Fuel Bill"] = f"£{fuel_match.group(1)}" + + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Get number of lighting outlets and number of fittings needing LEL + lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) + lel_fittings_match = re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text) + data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) + + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating(text) + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) + + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + + if data["Secondary Heating"]["Existing Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" + + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + return data From 0efd0163ee9ad103b97ef6ebdf4419c580f305b3 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 13:30:36 +0000 Subject: [PATCH 03/23] refactoring epr extraction --- etl/lodgement/app.py | 1 - utils/file_data_extraction.py | 126 ++++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 51 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index ede644b8..a395508c 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -43,5 +43,4 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue - extracted_contents[report_type] = file_extractor(filepath).extract() diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index cdd25f8a..15b183dc 100644 --- a/utils/file_data_extraction.py +++ 
b/utils/file_data_extraction.py @@ -69,6 +69,10 @@ def is_pdf(filename): class ElmhurstEprExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + def __init__(self, file_path): self.file_path = file_path @@ -219,7 +223,30 @@ class ElmhurstEprExtractor: return wall_data @staticmethod - def extract_primary_heating(text): + def _extract_heating_details(section_text): + """ + Extracts heating details from a given section of text. + + Args: + section_text (str): The section of text containing heating details. + + Returns: + dict: A dictionary containing heating system details. + """ + + system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text) + pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text) + controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text) + heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text) + + return { + "System": system_search.group(1).strip() if system_search else "", + "PCDF Reference": pcdf_search.group(1) if pcdf_search else "", + "Controls": controls_search.group(1).strip() if controls_search else "", + "% of Heat": int(heat_search.group(1)) if heat_search else 0, + } + + def extract_primary_heating(self, text): # Extract Primary Heating Section (Main Heating 1) primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL) @@ -228,61 +255,46 @@ class ElmhurstEprExtractor: primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 primary_text = primary_heating_section.group(1) - primary_heating_output = { - "Existing Primary Heating System": re.search( - r"Main Heating Code\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating PCDF Reference": re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1), - "Existing Primary Heating Controls": re.search( - r"Main Heating 
Controls\s*(.*?)\n", primary_text - ).group(1).strip(), - "Existing Primary Heating % of Heat": int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1) - ) - } + return self._extract_heating_details(primary_text) - return primary_heating_output - - @staticmethod - def extract_secondary_heating(text): + def extract_secondary_heating(self, text): # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) + output = {} if secondary_heating_section is None: - output["Existing Heating System"] = "" - output["Existing Heating PCDF Reference"] = "" - output["Existing Heating Controls"] = "" - output["Existing Heating % of Heat"] = 0 + + output["System"] = "" + output[" PCDF Reference"] = "" + output["Controls"] = "" + output["% of Heat"] = 0 else: secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + output.update( + **self._extract_heating_details(secondary_text) ) - output["Existing Heating System"] = main_heating_code_match_secondary.group(1).strip() - output["Existing Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", secondary_text - ).group(1) - - if output["Existing Heating System"] == "": - output["Existing Heating Controls"] = "" - else: - # Might not have heating controls on 2nd system - secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - output["Existing Heating Controls"] = ( - secondary_controls_match.group(1).strip() if secondary_controls_match else "" - ) - output["Existing Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1) - ) + output["Heating Code"] = ( + re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip() + if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text) + 
else "" + ) return output def extract(self): + """ + Extracts all relevant data from the EPR PDF. + + Returns: + dict: A dictionary containing extracted data, including: + - Address and Postcode + - SAP Rating and Primary Energy Use + - Lighting, Doors, Windows, Roof, and Wall Details + - Heating systems (Primary and Secondary) + - Building Parts + """ data = {} with open(self.file_path, "rb") as file: @@ -291,36 +303,56 @@ class ElmhurstEprExtractor: # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + if not address_match: + raise ValueError("Failed to extract address.") data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + if not sap_match: + raise ValueError("Failed to extract SAP rating.") data["Current SAP Rating"] = int(sap_match.group(1)) energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + if not energy_match: + raise ValueError("Failed to extract primary energy use.") data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Failed to extract number of storeys.") data["Number of Storeys"] = int(storeys_match.group(1)) fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + if not fuel_match: + raise ValueError("Failed to extract fuel bill.") data["Fuel Bill"] = f"£{fuel_match.group(1)}" total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Failed to extract total doors.") data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Failed to extract insulated doors.") data["Number of 
Insulated Doors"] = int(insulated_doors_match.group(1)) # Get number of lighting outlets and number of fittings needing LEL lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + if not lighting_fittings_match: + raise ValueError("Failed to extract lighting") data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) + if not lel_fittings_match: + raise ValueError("Failed to extract LEL fittings.") data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + if not windows_section: + raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) data["Primary Heating"] = self.extract_primary_heating(text) @@ -329,15 +361,9 @@ class ElmhurstEprExtractor: data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - - if data["Secondary Heating"]["Existing Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" - + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data From 749faaebca22c8353ab09965ddc9c087a6c5d0d0 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 17:07:56 +0000 Subject: [PATCH 04/23] extending extraction --- etl/lodgement/app.py | 151 ++++++++++++++++- 
utils/file_data_extraction.py | 116 ++++++++++++- utils/fullSapParser.py | 306 ++++++++++++++++++++++++++++++++++ 3 files changed, 562 insertions(+), 11 deletions(-) create mode 100644 utils/fullSapParser.py diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index a395508c..b8b7e393 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -1,5 +1,83 @@ import os import utils.file_data_extraction as file_extraction_tools +from utils.fullSapParser import FullSapParser + +output_template = { + "Property Address": None, + "Osm. ID": None, + "Postcode": None, + "City/County": None, + "District/Town": None, + "Funding Stream": None, + "Local Authority": None, + "Trustmark Lodgement ID": None, + "Certificate Number": None, + "EWI UMR": None, + "Loft UMR": None, + "Windows UMR": None, + "Doors UMR": None, + "Measure Lodgement Date": None, + "Full Lodgement Date": None, + "Name": None, + "Phone": None, + "Email": None, + "Secondary Contact Name": None, + "Secondary Contact Phone": None, + "Trustmark Licence Number": None, + "Retrofit Assessment Date": None, + "Company Name": None, + "Retrofit Designer Name": None, + "Property Type": None, + "Property Detachment": None, + "No. of Bedrooms": None, + "Property Age": None, + "SAP Rating Pre (from IMA)": None, + "Pre Heat Transfer": None, + "Pre Total Floor Area": None, + "Pre Heat Demand": None, + "Pre Air Tightness": None, + "SAP Rating Post (from EPC)": None, + "Post Heat Transfer": None, + "Post Total Floor Area": None, + "Post Heat Demand": None, + "Post Air Tightness": None, + "Number of Eligible Measures Installed": None, + "Total Cost of Works": None, + "Annual Fuel Saving (MTP)": None, + "Work Type ID": None, + "Measure Category": None, + "Installer": None, + "Operative Name": None, + "Operative Certif. 
Reference": None, + "Manufacturer": None, + "Model": None, + "Financial Protection Body (IBG)": None, + "Policy Start Date": None, + "IBG Policy Reference": None, + "Warranty Duration": None, + "Total Invoiced (Including VAT)": None, + "Installation Date": None, + "Handover Date": None, + "Percentage": None, + "Reference Number": None, +} + + +def update_dictionary_with_check(dictionary, updates): + """ + Updates a dictionary with key-value pairs, raising an error if the key does not exist. + + Args: + dictionary (dict): The dictionary to update. + updates (dict): The updates to apply. + + Raises: + KeyError: If a key in updates does not exist in the dictionary. + """ + for key, value in updates.items(): + if key not in dictionary: + raise KeyError(f"Key '{key}' does not exist in the dictionary.") + dictionary[key] = value def handler(): @@ -11,7 +89,11 @@ def handler(): # Ths source data will eventually come from Sharepoint source_data_path = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot" - output_template = "Trustmark Details - Template REV.25.11.24.xlsx" + output_template_file = "Trustmark Details - Template REV.25.11.24.xlsx" + funding_stream = "HUG2" + customer_name = "Shropshire Council" + customer_phone = "0345 678 9000" + customer_email = "affordablewarmth@shropshire.gov.uk" # List the folders in the source data path folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] @@ -20,7 +102,8 @@ def handler(): "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, "elmhurst summary report": None, "osmosis condition report": None, - "elmhurst evidence report": None + "elmhurst evidence report": None, + "full sap xml": FullSapParser, } for property_folder in folders: @@ -43,4 +126,68 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue + extracted_contents[report_type] = file_extractor(filepath).extract() + + if file_extraction_tools.is_xml(filepath): 
+ xml_type = file_extraction_tools.detect_xml_report_type(xml_path=filepath) + if xml_type is None: + raise ValueError(f"Unknown report type for {filename}") + file_extractor = extractors.get(xml_type) + if file_extractor is None: + continue + extracted_contents[xml_type] = file_extractor(filepath).extract() + + output_row_data = output_template.copy() + + # dict_keys([, , , 'City/County', 'District/Town', + # 'Local Authority', + # 'Trustmark Lodgement ID', + # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Name', 'Phone', 'Email', 'Secondary Contact + # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', + # 'Retrofit Designer Name', , 'No. of Bedrooms', + # , 'Pre Heat Transfer', 'Pre Total Floor Area', 'Pre Heat Demand', + # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', + # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', + # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative + # Certif. Reference', 'Manufacturer', 'Model', 'Financial Protection Body (IBG)', 'Policy Start Date', + # 'IBG Policy Reference', 'Warranty Duration', 'Total Invoiced (Including VAT)', 'Installation Date', + # 'Handover Date', 'Percentage', 'Reference Number']) + # Populate the output row data + if extracted_contents["elmhurst epr"]: + total_floor_area = sum( + [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + + # Get the conservatory floor area + extracted_contents["elmhurst epr"]["Conservatory"]["Floor Area (m2)"] + ) + + to_insert = { + "Property Address": property_folder.split(")")[1].strip(), + "Osm. 
ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + "Postcode": extracted_contents["elmhurst epr"]["Postcode"], + "City/County": None, + "District/Town": None, + "Funding Stream": funding_stream, + "Local Authority": None, + 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], + 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], + 'Pre Heat Transfer': extracted_contents["elmhurst epr"][ + "Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area, + } + + output_row_data["Property Address"] = property_folder.split(")")[1].strip() + output_row_data["Osm. ID"] = property_folder.split(")")[0].strip().lstrip("(").strip() + output_row_data["Postcode"] = extracted_contents["elmhurst epr"]["Postcode"] + output_row_data["City/County"] = () + output_row_data["Batch"] = () + output_row_data["Funding Stream"] = funding_stream + output_row_data["Risk Path"] = () + + if extracted_contents["full sap xml"]: + to_insert = { + "Property Type": extracted_contents["full sap xml"]["Property Type"], + "Property Detachment": extracted_contents["full sap xml"]["Built Form"], + "Property Age": extracted_contents["full sap xml"]["Age Band"], + + } diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 15b183dc..f0d341c6 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -1,6 +1,10 @@ import PyPDF2 import re from collections import Counter +from utils.logger import setup_logger +from xml.dom.minidom import parseString + +logger = setup_logger() """ This script contains functions used to extract data from retrofit survey files, including EPRs, @@ -61,6 +65,25 @@ def detect_pdf_report_type(pdf_path): return None +def detect_xml_report_type(xml_path): + """ + Detects the type of XML report based on content or filename. 
+ :param xml_path: String path to the XML file + :return: String type of the report ("full sap xml", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(xml_path, "r") as file: + contents = file.read() + + contents = parseString(contents) + product_tag_search = contents.getElementsByTagName("Product") + if product_tag_search: + if product_tag_search[0].firstChild.nodeValue == "Sap 2012 Desktop": + return "full sap xml" + + raise Exception("Not implemented") + + def is_pdf(filename): """ Determines if the provided filename is a PDF file. @@ -68,6 +91,13 @@ def is_pdf(filename): return filename.endswith(".pdf") +def is_xml(filename): + """ + Determines if the provided filename is an XML file. + """ + return filename.endswith(".xml") + + class ElmhurstEprExtractor: """ A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). @@ -223,26 +253,82 @@ class ElmhurstEprExtractor: return wall_data @staticmethod - def _extract_heating_details(section_text): + def extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "Conservatory" and "Doors". + + Args: + text (str): The full text of the EPR PDF. 
+ + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + conservatory_match = re.search(r"Conservatory\s*(.*?)\s*Doors", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Conservatory Present:\s*(Yes|No)", conservatory_text) + + if not present_match or present_match.group(1).strip() == "No": + logger.info("Conservatory not present.") + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # Extract conservatory details + separated_match = re.search(r"Conservatory Separated:\s*(Yes|No)", conservatory_text) + floor_area_match = re.search(r"Conservatory Floor Area:\s*([\d.]+)", conservatory_text) + double_glazed_match = re.search(r"Conservatory Double Glazed:\s*(Yes|No)", conservatory_text) + glazed_perimeter_match = re.search(r"Conservatory Glazed Perimeter:\s*([\d.]+)", conservatory_text) + height_match = re.search(r"Heated Conservatory Height:\s*(.*?)(?=\n|$)", conservatory_text) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if 
height_match else "", + } + + @staticmethod + def _extract_heating_details(section_text, default_value=""): """ Extracts heating details from a given section of text. Args: section_text (str): The section of text containing heating details. + default_value (str, optional): The default value to return for missing fields. Defaults to "". Returns: dict: A dictionary containing heating system details. """ - system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text) pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text) controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text) heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text) return { - "System": system_search.group(1).strip() if system_search else "", - "PCDF Reference": pcdf_search.group(1) if pcdf_search else "", - "Controls": controls_search.group(1).strip() if controls_search else "", + "System": system_search.group(1).strip() if system_search else default_value, + "PCDF Reference": pcdf_search.group(1) if pcdf_search else default_value, + "Controls": controls_search.group(1).strip() if controls_search else default_value, "% of Heat": int(heat_search.group(1)) if heat_search else 0, } @@ -257,7 +343,7 @@ class ElmhurstEprExtractor: return self._extract_heating_details(primary_text) - def extract_secondary_heating(self, text): + def extract_secondary_heating_details(self, text): # Extract Secondary Heating Section (Main Heating 2) secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL) @@ -265,7 +351,7 @@ class ElmhurstEprExtractor: if secondary_heating_section is None: output["System"] = "" - output[" PCDF Reference"] = "" + output["PCDF Reference"] = "" output["Controls"] = "" output["% of Heat"] = 0 @@ -304,65 +390,77 @@ class ElmhurstEprExtractor: # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) if not 
address_match: + logger.error("Failed to extract address.") raise ValueError("Failed to extract address.") data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) if not sap_match: + logger.error("Failed to extract SAP rating.") raise ValueError("Failed to extract SAP rating.") data["Current SAP Rating"] = int(sap_match.group(1)) energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) if not energy_match: + logger.error("Failed to extract primary energy use.") raise ValueError("Failed to extract primary energy use.") data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) if not storeys_match: - raise ValueError("Failed to extract number of storeys.") + logger.error("Failed to extract the number of storeys.") + raise ValueError("Failed to extract the number of storeys.") data["Number of Storeys"] = int(storeys_match.group(1)) fuel_match = re.search(r"TOTAL\s*£(\d+)", text) if not fuel_match: + logger.error("Failed to extract fuel bill.") raise ValueError("Failed to extract fuel bill.") data["Fuel Bill"] = f"£{fuel_match.group(1)}" total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) if not total_doors_match: + logger.error("Failed to extract total doors.") raise ValueError("Failed to extract total doors.") data["Total Number of Doors"] = int(total_doors_match.group(1)) # Extract Number of Insulated Doors insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) if not insulated_doors_match: + logger.error("Failed to extract insulated doors.") raise ValueError("Failed to extract insulated doors.") data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) # Get number of lighting outlets and number of fittings needing LEL lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) if 
not lighting_fittings_match: + logger.error("Failed to extract lighting.") raise ValueError("Failed to extract lighting") data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text) if not lel_fittings_match: + logger.error("Failed to extract LEL fittings.") raise ValueError("Failed to extract LEL fittings.") data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) if not windows_section: + logger.error("Failed to extract window data.") raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) data["Primary Heating"] = self.extract_primary_heating(text) - data["Secondary Heating"] = self.extract_secondary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) data["Building Parts"] = self.extract_building_parts(text) data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) if not water_heating_code_match: + logger.error("Failed to extract water heating code.") raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() diff --git a/utils/fullSapParser.py b/utils/fullSapParser.py new file mode 100644 index 00000000..540eff6f --- /dev/null +++ b/utils/fullSapParser.py @@ -0,0 +1,306 @@ +import boto3 +from xml.dom.minidom import parseString + +PROPERTY_AGE_BAND = { + "A": "before 1900", + "B": "1900-1929", + "C": "1930-1949", + "D": "1950-1966", + "E": "1967-1975", + "F": "1976-1982", + "G": "1983-1990", + "H": "1991-1995", 
+ "I": "1996-2002", + "J": "2003-2006", + "K": "2007-2011", + "L": "2012 onwards" +} + +POSITION_OF_FLAT = { + "TopFloorFlat": "(top floor)" +} + +MAINHEATING_LOOKUP = { + "SEB": "Electric (SEB modern slimline storage heaters)" +} + +WINDOWS_YEAR_LOOKUP = { + "unknown install date": "unknown year", + "unknown install": "unknown year", + "post or during 2002": "2002 onwards", +} + + +class FullSapParser: + full_address = None + archetype = None + age_band = None + unheated_corridor = None + property_type = None + built_form = None + + # ventilation + mechanical_ventilation = None + cross_ventilation = None + night_ventilation = None + + # dimensions + number_of_storeys = None + property_dimensions = None + + # fabric + low_energy_lighting = None + + # Heating + heating1 = None + cylinder = None + cylinder_stat = None + + def __init__(self, filekey, bucket_name=None): + self.s3_client = boto3.client('s3') + self.bucket_name = bucket_name + self.filekey = filekey + self.full_sap = None + + self._read_file() + + def _read_file(self): + """ + Reads the XML file either locally or from S3 and parses it using minidom. + + Raises: + ValueError: If the file cannot be found, read, or parsed. 
+ """ + try: + if self.bucket_name: + # Read from S3 + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=self.filekey) + xml_content = response['Body'].read() + else: + # Read locally + with open(self.filekey, "r") as f: + xml_content = f.read() + + # Parse the XML content using minidom + self.full_sap = parseString(xml_content) + except FileNotFoundError: + raise ValueError(f"Local file not found: {self.filekey}") + except Exception as e: + raise ValueError(f"An error occurred while reading or parsing the XML: {e}") + + def extract(self, _return=True): + self.get_address() + self.get_archetype() + self.get_age_band() + self.get_unheated_corridor() + self.get_heating_1() + self.get_ventilation() + self.get_floor_area() + self.get_low_energy_lighting() + self.get_cylinder() + + if _return: + return { + "Property Type": self.property_type, + "Built Form": self.built_form, + "Age Band": self.age_band, + } + + def get_address(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + address = self.full_sap.getElementsByTagName("AddressAsDesigned") + if len(address) != 1: + raise ValueError("Non-unique address tag found - investigate me") + + address = address[0] + data = {} + for node in address.childNodes: + if node.nodeType == node.ELEMENT_NODE: + data[node.nodeName] = node.firstChild.nodeValue if node.firstChild else None + + self.full_address = " ".join( + [ + x.title() for x in [data["AddressLine1"], data["AddressLine2"], data["AddressLine3"], data["Town"]] + if x is not None + ] + ) + " " + data["Postcode"] + + def get_archetype(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + property_type1 = self.full_sap.getElementsByTagName('PropertyType1') + property_type2 = self.full_sap.getElementsByTagName('PropertyType2') + position_of_flat = self.full_sap.getElementsByTagName('PositionOfFlat') + + if len(property_type1) != 1 or len(property_type2) != 1: + raise ValueError("Non-unique 
property tag found - investigate me") + + property_type1 = property_type1[0].firstChild.nodeValue + property_type2 = property_type2[0].firstChild.nodeValue + if position_of_flat[0].firstChild: + position_of_flat = POSITION_OF_FLAT[position_of_flat[0].firstChild.nodeValue] + else: + position_of_flat = None + + self.property_type = property_type1 + self.built_form = property_type2 + self.archetype = property_type1 + " - " + property_type2 + + if position_of_flat: + self.archetype = self.archetype + " " + position_of_flat + + def get_age_band(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + property_age_band = self.full_sap.getElementsByTagName('PropertyAgeBand') + + if len(property_age_band) != 1: + raise ValueError("Non-unique property age band tag found - investigate me") + + property_age_band = property_age_band[0].firstChild.nodeValue + self.age_band = PROPERTY_AGE_BAND[property_age_band] + + def get_wall_area_for_description(self, description): + wall_recs = self.full_sap.getElementsByTagName("WallRec") + for wall_rec in wall_recs: + desc_elements = wall_rec.getElementsByTagName("Description") + if desc_elements and desc_elements[0].firstChild.data == description: + area_elements = wall_rec.getElementsByTagName("Area") + if area_elements: + area = float(area_elements[0].firstChild.data) + # Placeholder for wall_description which you'll populate later + return f"Unheated corridor - {area} area" + return None + + def get_unheated_corridor(self): + """ + Unheated corridors don't always exist so we'll need to search for it + :return: + """ + + if not self.full_sap: + raise ValueError("You need to read the file first") + + self.unheated_corridor = self.get_wall_area_for_description("Flat corridor Main") + + def get_heating_1(self): + + if not self.full_sap: + raise ValueError("You need to read the file first") + + main_heating_system = self.full_sap.getElementsByTagName('MainHeatingSystem1') + + if len(main_heating_system) != 
1: + raise ValueError("Non-unique main heating system tag found - investigate me") + + main_heating_system = main_heating_system[0] + + mhs = main_heating_system.getElementsByTagName('MHS')[0].firstChild.nodeValue + mhs = MAINHEATING_LOOKUP.get(mhs, mhs) + + fraction = main_heating_system.getElementsByTagName('Fraction')[0].firstChild.nodeValue + + self.heating1 = f"{mhs} : {fraction}% of heating" + + def get_ventilation(self): + + bool_lookup = { + "true": True, + "false": False + } + + # Extract MechanicalVentilationDecentralised + mech_vent = self.full_sap.getElementsByTagName("MechanicalVentilationDecentralised") + if mech_vent and mech_vent[0].childNodes: + mech_vent_value = mech_vent[0].firstChild.nodeValue + else: + mech_vent_value = None + + # Extract CrossVentilation + cross_vent = self.full_sap.getElementsByTagName("CrossVentilation") + if cross_vent and cross_vent[0].childNodes: + cross_vent_value = cross_vent[0].firstChild.nodeValue + cross_vent_value = bool_lookup.get(cross_vent_value, cross_vent_value) + else: + cross_vent_value = None + + # Extract NightVentilation + night_vent = self.full_sap.getElementsByTagName("NightVentilation") + if night_vent and night_vent[0].childNodes: + night_vent_value = night_vent[0].firstChild.nodeValue + night_vent_value = bool_lookup.get(night_vent_value, night_vent_value) + else: + night_vent_value = None + + # Create the outputs + self.mechanical_ventilation = "Mechanical ventilation present" if mech_vent_value else "No mechanical " \ + "ventilation" + self.cross_ventilation = "Cross ventilation present" if cross_vent_value else "No cross ventilation" + self.night_ventilation = "Night ventilation present" if night_vent_value else "No night ventilation" + + def get_floor_area(self): + + self.number_of_storeys = int(self.full_sap.getElementsByTagName('NumberOfStoreys')[0].firstChild.nodeValue) + storeys = self.full_sap.getElementsByTagName('StoreyMeasurementRec') + + # TODO: The first StoreyMeasurementRec tag looks 
like this in the examples we've seen: + # + # Indicating that the tag is explicitly indicated as empty + + storey_data = [] + storey_index = -1 + for storey in storeys: + storey_index += 1 + + if storey.getAttribute("xsi:nil") == "true": + continue + + if storey_index == -1: + raise NotImplementedError( + "Investigated me - potentially basement found but need to confirm with Basement tag" + ) + + floor_area = storey.getElementsByTagName('InternalFloorArea') + if not floor_area: + continue + + floor_area = float(floor_area[0].firstChild.nodeValue) + # If floor area is 0, skip this storey + if not floor_area: + continue + + perimeter = float(storey.getElementsByTagName('InternalPerimeter')[0].firstChild.nodeValue) + height = float(storey.getElementsByTagName('StoreyHeight')[0].firstChild.nodeValue) + + storey_data.append({ + "storey_index": storey_index, + "Floor Area": floor_area, + "Perimeter": perimeter, + "Height": height + }) + + # We will convert this into a table in the markdown + self.property_dimensions = storey_data + + def get_low_energy_lighting(self): + # Extract the values of the LightFittings and LELFittings tags + light_fittings = self.full_sap.getElementsByTagName('LightFittings')[0].firstChild.data + lel_fittings = self.full_sap.getElementsByTagName('LELFittings')[0].firstChild.data + + # Construct the string message + self.low_energy_lighting = f"{lel_fittings} out of {light_fittings} lighting fittings are low energy." + + def get_cylinder(self): + insulation_type = self.full_sap.getElementsByTagName('InsulationType')[0].firstChild.data + insulation_thickness = self.full_sap.getElementsByTagName('InsulationThickness')[0].firstChild.data + + if insulation_type and insulation_thickness: + self.cylinder = f"Insulated, {insulation_type}: {insulation_thickness}mm." + else: + self.cylinder = "Not insulated." 
+ + self.cylinder_stat = self.full_sap.getElementsByTagName('CylinderStat')[0].firstChild.data From 63521dd1e303cae0daa1fdf8e405d7e8c953a1da Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 17:18:17 +0000 Subject: [PATCH 05/23] extending extraction --- etl/lodgement/app.py | 21 +++++++++++++++------ utils/file_data_extraction.py | 2 ++ 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index b8b7e393..4ff8bdf1 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -126,7 +126,7 @@ def handler(): file_extractor = extractors.get(report_type) if file_extractor is None: continue - + extracted_contents[report_type] = file_extractor(filepath).extract() if file_extraction_tools.is_xml(filepath): @@ -136,6 +136,7 @@ def handler(): file_extractor = extractors.get(xml_type) if file_extractor is None: continue + extracted_contents[xml_type] = file_extractor(filepath).extract() output_row_data = output_template.copy() @@ -144,10 +145,12 @@ def handler(): # 'Local Authority', # 'Trustmark Lodgement ID', # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', - # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Name', 'Phone', 'Email', 'Secondary Contact + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', + # 'Name', 'Phone', 'Email', (owner) + # 'Secondary Contact # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', # 'Retrofit Designer Name', , 'No. 
of Bedrooms', - # , 'Pre Heat Transfer', 'Pre Total Floor Area', 'Pre Heat Demand', + # , # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative @@ -159,7 +162,12 @@ def handler(): total_floor_area = sum( [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + # Get the conservatory floor area - extracted_contents["elmhurst epr"]["Conservatory"]["Floor Area (m2)"] + [extracted_contents["elmhurst epr"]["Conservatory"]["Conservatory Floor Area"]] + ) + + pre_heat_transfer = extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] + pre_heat_demand = ( + extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area ) to_insert = { @@ -172,8 +180,9 @@ def handler(): "Local Authority": None, 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], - 'Pre Heat Transfer': extracted_contents["elmhurst epr"][ - "Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area, + 'Pre Heat Transfer': pre_heat_transfer, + 'Pre Total Floor Area': total_floor_area, + 'Pre Heat Demand': pre_heat_demand, } output_row_data["Property Address"] = property_folder.split(")")[1].strip() diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index f0d341c6..ae75735b 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -387,6 +387,8 @@ class ElmhurstEprExtractor: reader = PyPDF2.PdfReader(file) text = "".join(page.extract_text() for page in reader.pages) + data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip() + # Extracting individual components address_match = 
re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) if not address_match: From bcbb43ed8f045e53607cd17e1b4cff4709208cf9 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 08:38:38 +0000 Subject: [PATCH 06/23] adding the summary report extraction class --- etl/lodgement/app.py | 128 +++++++++-------- etl/lodgement/requirements.txt | 4 +- utils/OsmosisCondtionReportParser.py | 49 +++++++ utils/file_data_extraction.py | 196 ++++++++++++++++++++++++++- 4 files changed, 315 insertions(+), 62 deletions(-) create mode 100644 utils/OsmosisCondtionReportParser.py diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 4ff8bdf1..3688ca19 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -1,6 +1,10 @@ import os + +import pandas as pd + import utils.file_data_extraction as file_extraction_tools from utils.fullSapParser import FullSapParser +from utils.OsmosisCondtionReportParser import OsmosisConditionReportParser output_template = { "Property Address": None, @@ -9,6 +13,7 @@ output_template = { "City/County": None, "District/Town": None, "Funding Stream": None, + # "Risk Path": None, "Local Authority": None, "Trustmark Lodgement ID": None, "Certificate Number": None, @@ -18,11 +23,12 @@ output_template = { "Doors UMR": None, "Measure Lodgement Date": None, "Full Lodgement Date": None, - "Name": None, - "Phone": None, - "Email": None, - "Secondary Contact Name": None, - "Secondary Contact Phone": None, + "Owner - Name": None, + "Owner - Phone": None, + "Owner - Email": None, + "Tenant - Name": None, + "Tenant - Phone": None, + "R. Assessor - Name": None, "Trustmark Licence Number": None, "Retrofit Assessment Date": None, "Company Name": None, @@ -30,7 +36,7 @@ output_template = { "Property Type": None, "Property Detachment": None, "No. 
of Bedrooms": None, - "Property Age": None, + "Property age": None, "SAP Rating Pre (from IMA)": None, "Pre Heat Transfer": None, "Pre Total Floor Area": None, @@ -44,22 +50,6 @@ output_template = { "Number of Eligible Measures Installed": None, "Total Cost of Works": None, "Annual Fuel Saving (MTP)": None, - "Work Type ID": None, - "Measure Category": None, - "Installer": None, - "Operative Name": None, - "Operative Certif. Reference": None, - "Manufacturer": None, - "Model": None, - "Financial Protection Body (IBG)": None, - "Policy Start Date": None, - "IBG Policy Reference": None, - "Warranty Duration": None, - "Total Invoiced (Including VAT)": None, - "Installation Date": None, - "Handover Date": None, - "Percentage": None, - "Reference Number": None, } @@ -100,14 +90,19 @@ def handler(): extractors = { "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor, - "elmhurst summary report": None, - "osmosis condition report": None, + "elmhurst summary report": file_extraction_tools.ElmhurstSummaryReportExtractor, + "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, } + extracted = [] for property_folder in folders: + coordinator_folder = os.path.join(source_data_path, property_folder, "2. RA Coordinator Info") + # Check if this folder exists + if not os.path.exists(coordinator_folder): + coordinator_folder = os.path.join(source_data_path, property_folder, "1. 
RA Coordinator Info") # Get the contents of the folder coordinator_folder_contents = [ @@ -123,10 +118,10 @@ def handler(): if report_type is None: raise ValueError(f"Unknown report type for {filename}") - file_extractor = extractors.get(report_type) + file_extractor = extractors[report_type] if file_extractor is None: continue - + extracted_contents[report_type] = file_extractor(filepath).extract() if file_extraction_tools.is_xml(filepath): @@ -141,24 +136,27 @@ def handler(): output_row_data = output_template.copy() - # dict_keys([, , , 'City/County', 'District/Town', - # 'Local Authority', - # 'Trustmark Lodgement ID', - # 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', - # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', - # 'Name', 'Phone', 'Email', (owner) - # 'Secondary Contact - # Name', 'Secondary Contact Phone', 'Trustmark Licence Number', 'Retrofit Assessment Date', 'Company Name', - # 'Retrofit Designer Name', , 'No. of Bedrooms', - # , - # 'Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat Transfer', 'Post Total Floor Area', - # 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures Installed', 'Total Cost of Works', - # 'Annual Fuel Saving (MTP)', 'Work Type ID', 'Measure Category', 'Installer', 'Operative Name', 'Operative - # Certif. 
Reference', 'Manufacturer', 'Model', 'Financial Protection Body (IBG)', 'Policy Start Date', - # 'IBG Policy Reference', 'Warranty Duration', 'Total Invoiced (Including VAT)', 'Installation Date', - # 'Handover Date', 'Percentage', 'Reference Number']) + # dict_keys([ 'City/County', 'District/Town', + # 'Local Authority', 'Trustmark Lodgement ID', 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR', + # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone', + # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone', + # 'Trustmark Licence Number', + # 'Company Name', 'Retrofit Designer Name', + # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat + # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures + # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) # Populate the output row data - if extracted_contents["elmhurst epr"]: + + update_dictionary_with_check( + output_row_data, + { + "Funding Stream": funding_stream, + "Property Address": property_folder.split(")")[1].strip(), + "Osm. ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + } + ) + + if extracted_contents.get("elmhurst epr"): total_floor_area = sum( [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] + # Get the conservatory floor area @@ -170,33 +168,45 @@ def handler(): extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area ) - to_insert = { - "Property Address": property_folder.split(")")[1].strip(), - "Osm. 
ID": property_folder.split(")")[0].strip().lstrip("(").strip(), + epr_to_insert = { "Postcode": extracted_contents["elmhurst epr"]["Postcode"], "City/County": None, "District/Town": None, - "Funding Stream": funding_stream, "Local Authority": None, - 'Property Age': extracted_contents["elmhurst epr"]["Property Age"], 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], 'Pre Heat Transfer': pre_heat_transfer, 'Pre Total Floor Area': total_floor_area, 'Pre Heat Demand': pre_heat_demand, + "R. Assessor - Name": extracted_contents["elmhurst epr"]["Assessor Name"], + "Retrofit Assessment Date": extracted_contents["elmhurst epr"]["Assessment Date"], } + update_dictionary_with_check( + output_row_data, + epr_to_insert + ) - output_row_data["Property Address"] = property_folder.split(")")[1].strip() - output_row_data["Osm. ID"] = property_folder.split(")")[0].strip().lstrip("(").strip() - output_row_data["Postcode"] = extracted_contents["elmhurst epr"]["Postcode"] - output_row_data["City/County"] = () - output_row_data["Batch"] = () - output_row_data["Funding Stream"] = funding_stream - output_row_data["Risk Path"] = () - - if extracted_contents["full sap xml"]: - to_insert = { + if extracted_contents.get("full sap xml"): + xml_to_insert = { "Property Type": extracted_contents["full sap xml"]["Property Type"], "Property Detachment": extracted_contents["full sap xml"]["Built Form"], - "Property Age": extracted_contents["full sap xml"]["Age Band"], + "Property age": extracted_contents["full sap xml"]["Age Band"], } + update_dictionary_with_check( + output_row_data, + xml_to_insert + ) + + if extracted_contents.get("osmosis condition report"): + cr_to_insert = { + "No. of Bedrooms": extracted_contents["osmosis condition report"]["No. 
of Bedrooms"], + # "Risk Path": extracted_contents["osmosis condition report"]["Risk Assessment Pathway"], + } + update_dictionary_with_check( + output_row_data, + cr_to_insert + ) + + extracted.append(output_row_data) + + extracted_df = pd.DataFrame(extracted) diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 601907ed..75c63b26 100644 --- a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -5,4 +5,6 @@ openpyxl boto3 usaddress==0.5.11 fuzzywuzzy==0.18.0 -python-dotenv \ No newline at end of file +python-dotenv +python-docx +pymupdf diff --git a/utils/OsmosisCondtionReportParser.py b/utils/OsmosisCondtionReportParser.py new file mode 100644 index 00000000..4d8873a2 --- /dev/null +++ b/utils/OsmosisCondtionReportParser.py @@ -0,0 +1,49 @@ +import re +import boto3 +import PyPDF2 +import fitz + + +class OsmosisConditionReportParser: + + def __init__(self, filekey, bucket_name=None): + self.s3_client = boto3.client('s3') + self.bucket_name = bucket_name + self.filekey = filekey + self.pdf_text = None + + self._read_file() + + def _read_file(self): + """ + Reads the XML file either locally or from S3 and parses it using minidom. + + Raises: + ValueError: If the file cannot be found, read, or parsed. + """ + + chunk_size = 10 + + try: + if self.bucket_name: + # Read from S3 + raise NotImplementedError("Imeplement me") + else: + + with fitz.open(self.filekey) as pdf: + text = "" + for page in pdf: + text += page.get_text() + + # Parse the XML content using minidom + self.pdf_text = text + except FileNotFoundError: + raise ValueError(f"Local file not found: {self.filekey}") + except Exception as e: + raise ValueError(f"An error occurred while reading or parsing the XML: {e}") + + def extract(self): + return { + "No. of Bedrooms": int(re.search(r"No\. 
of Bedrooms \(Total\)\s*(\d+)", self.pdf_text).group(1)), + "Risk Assessment Pathway": re.search(r"Risk\s*Assessment\s*Pathway\s*([A-Z])", self.pdf_text).group(1) + } diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index ae75735b..2337ea9d 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -100,8 +100,8 @@ def is_xml(filename): class ElmhurstEprExtractor: """ - A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). - """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ def __init__(self, file_path): self.file_path = file_path @@ -388,6 +388,7 @@ class ElmhurstEprExtractor: text = "".join(page.extract_text() for page in reader.pages) data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip() + data["Assessment Date"] = re.search(r"\nAssessment Date\s*(.*?)\n", text).group(1).strip() # Extracting individual components address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) @@ -467,3 +468,194 @@ class ElmhurstEprExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data + + +class ElmhurstSummaryReportExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + """ + Extracts specific data from the provided PDF file. 
+ Data includes: + - Current SAP rating + - Fuel Bill + - Address + """ + + # Expected keys: + # dict_keys(['Total Number of Doors', 'Number of Insulated + # Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows', + # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', + # 'Water Heating Code']) + + data = { + + } + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Match and extract + name_match = re.search(r"Name:\s*([A-Za-z\s]+)\s*Title:\s*([A-Za-z\.]+)", text) + if not name_match: + raise ValueError("Couldn't extract surveyor name") + data["Assessor Name"] = name_match.group(2).strip() + " " + name_match.group(1).strip() + data["Assessment Date"] = re.search(r"Inspection Date:\s*(.*?)\n", text).group(1).strip() + + # Address and postcode + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no.group(1).strip() if house_no else "", + house_name.group(1).strip() if house_name else "", + street.group(1).strip() if street else "", + locality.group(1).strip() if locality else "", + town.group(1).strip() if town else "", + county.group(1).strip() if county else "", + region.group(1).strip() if region else "", + postcode.group(1).strip() if postcode else "" + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if 
part]) + data["Postcode"] = postcode.group(1).strip() + + # Extract Current SAP rating + sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) + if not sap_match: + raise ValueError("Could not extract SAP rating") + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + + # We don't have primary energy in the summary report + data['Primary Energy Use Intensity (kWh/m2/yr)'] = None + + # Number of storeys + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Could not extract number of storeys") + data["Number of Storeys"] = int(storeys_match.group(1)) + + # Extract Fuel Bill + fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) + if not fuel_bill_match: + raise ValueError("Could not extract fuel bill") + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Could not extract total number of doors") + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Could not extract number of insulated doors") + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + windows_text = windows_section.group(1) + window_data = extract_window_age_description(windows_text) + data.update(window_data) + + # Extract heating system + # Extract Primary Heating Data + # Extract Primary Heating Section + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if 
primary_heating_section1 else primary_heating_section2 + + primary_text = primary_heating_section.group(1) + + data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( + 1).strip() + data["Existing Primary Heating PCDF Reference"] = re.search( + r"PCDF boiler Reference\s*(\d+)", primary_text + ).group(1) + data["Existing Primary Heating Controls"] = re.search( + r"Main Heating Controls\s*(.*?)\n", primary_text + ).group(1).strip() + data["Existing Primary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) + ) + + # Extract Secondary Heating Section + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + + if secondary_heating_section is None: + data["Existing Secondary Heating System"] = "" + data["Existing Secondary Heating PCDF Reference"] = "" + data["Existing Secondary Heating Controls"] = "" + data["Existing Secondary Heating % of Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() + data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", + secondary_text).group(1) + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + data["Existing Secondary Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + data["Existing Secondary Heating % of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + # Extract Secondary Heating and Water Heating Codes + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", 
text) + + if data["Existing Secondary Heating System"] == "": + data["Secondary Heating Code"] = "" + else: + data["Secondary Heating Code"] = secondary_heating_code_match.group( + 1).strip() if secondary_heating_code_match else "" + + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + dimensions = extract_building_parts_summary(text) + data.update(dimensions) + + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + extracted_roof_data = extract_roof_details_summary(text) + main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] + data["Main Roof Type"] = main_roof_data["Roof Type"] + data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] + data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] + + walls_data = extract_wall_details_summary(text) + # Get the main building wall data + main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] + data["Main Wall Type"] = main_building_walls["Wall Type"] + data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] + data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] + data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] + data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] + data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] + data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] + data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] + + return data From 
f141aa4d842a38d8133bdf9b586224333f5372be Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 08:42:53 +0000 Subject: [PATCH 07/23] extracting windows --- utils/file_data_extraction.py | 71 ++++++++++++++++++++++++++++++----- 1 file changed, 62 insertions(+), 9 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 2337ea9d..d444bff8 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -478,6 +478,59 @@ class ElmhurstSummaryReportExtractor: def __init__(self, file_path): self.file_path = file_path + @staticmethod + def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. + + Parameters: + windows_text (str): The text section containing window data. + + Returns: + dict: A dictionary with the most common window age description and its proportion. + """ + # Clean up windows_text by removing line breaks for better pattern matching + windows_text = windows_text.replace("\n", "") + + # Define possible window age descriptions + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + + # Count occurrences of each description + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + # Determine the most common description and calculate its proportion + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + 
second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + def extract(self): """ Extracts specific data from the provided PDF file. @@ -488,8 +541,7 @@ class ElmhurstSummaryReportExtractor: """ # Expected keys: - # dict_keys(['Total Number of Doors', 'Number of Insulated - # Doors', 'Number of Light Fittings', 'Number of LEL Fittings', 'Number of fittings needing LEL', 'Windows', + # dict_keys(['Windows', # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', # 'Water Heating Code']) @@ -569,10 +621,15 @@ class ElmhurstSummaryReportExtractor: raise ValueError("Could not extract number of insulated doors") data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + # lighting + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) - windows_text = windows_section.group(1) - window_data = extract_window_age_description(windows_text) - data.update(window_data) + if not windows_section: + raise ValueError("Failed to extract window data.") + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) # Extract heating system # Extract Primary Heating Data @@ -636,10 +693,6 @@ class ElmhurstSummaryReportExtractor: dimensions = extract_building_parts_summary(text) data.update(dimensions) - data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) - data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) - data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] - extracted_roof_data = extract_roof_details_summary(text) main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] data["Main Roof Type"] = main_roof_data["Roof Type"] From d489b4346fd6e1f940de4fb1f61ca6cd6b10cf24 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:14:38 +0000 Subject: [PATCH 08/23] extracting secondary heating --- utils/file_data_extraction.py | 116 ++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 55 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index d444bff8..20590afd 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -531,6 +531,62 @@ class ElmhurstSummaryReportExtractor: "Number of Windows": sum(description_counts.values()) } + @staticmethod + def extract_primary_heating(text): + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, 
re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 + if primary_heating_section is None: + raise ValueError("Failed to extract primary heating data.") + + primary_text = primary_heating_section.group(1) + + output = { + 'System': re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(1).strip(), + 'PCDF Reference': re.search(r"PCDF boiler Reference\s*(\d+)", primary_text).group(1), + 'Controls': re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(1).strip(), + '% of Heat': int(re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)) + } + return output + + @staticmethod + def extract_secondary_heating_details(text): + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + + # Defaults + output = { + "System": "", + "PCDF Reference": "", + "Controls": "", + "% of Heat": 0, + "Heating Code": "" + } + if secondary_heating_section is not None: + # Overwrite defaults + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + output["System"] = main_heating_code_match_secondary.group(1).strip() + output["PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) + + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + output["Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + output["% of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + if output["System"] != "": + output["Heating Code"] = ( + 
secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else "" + ) + + return output + def extract(self): """ Extracts specific data from the provided PDF file. @@ -541,13 +597,11 @@ class ElmhurstSummaryReportExtractor: """ # Expected keys: - # dict_keys(['Windows', + # dict_keys([ # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', # 'Water Heating Code']) - data = { - - } + data = {} with (open(self.file_path, "rb") as file): reader = PyPDF2.PdfReader(file) @@ -631,62 +685,14 @@ class ElmhurstSummaryReportExtractor: raise ValueError("Failed to extract window data.") data["Windows"] = self.extract_window_age_description(windows_section.group(1)) - # Extract heating system - # Extract Primary Heating Data - # Extract Primary Heating Section - primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) - primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2 - - primary_text = primary_heating_section.group(1) - - data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group( - 1).strip() - data["Existing Primary Heating PCDF Reference"] = re.search( - r"PCDF boiler Reference\s*(\d+)", primary_text - ).group(1) - data["Existing Primary Heating Controls"] = re.search( - r"Main Heating Controls\s*(.*?)\n", primary_text - ).group(1).strip() - data["Existing Primary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1) - ) + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) # Extract Secondary Heating Section - secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) - - if 
secondary_heating_section is None: - data["Existing Secondary Heating System"] = "" - data["Existing Secondary Heating PCDF Reference"] = "" - data["Existing Secondary Heating Controls"] = "" - data["Existing Secondary Heating % of Heat"] = 0 - - else: - secondary_text = secondary_heating_section.group(1) - - main_heating_code_match_secondary = re.search( - r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text - ) - data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip() - data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", - secondary_text).group(1) - second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) - data["Existing Secondary Heating Controls"] = ( - second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" - ) - data["Existing Secondary Heating % of Heat"] = int( - re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) - ) # Extract Secondary Heating and Water Heating Codes - secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) - water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) - if data["Existing Secondary Heating System"] == "": - data["Secondary Heating Code"] = "" - else: - data["Secondary Heating Code"] = secondary_heating_code_match.group( - 1).strip() if secondary_heating_code_match else "" + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) data["Water Heating Code"] = water_heating_code_match.group(1).strip() From 26e0206f378c4bedc8bca9e42d43e5d7bfcc196f Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:20:44 +0000 Subject: [PATCH 09/23] extracted roof and walls --- utils/file_data_extraction.py | 233 ++++++++++++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 14 deletions(-) diff --git a/utils/file_data_extraction.py 
b/utils/file_data_extraction.py index 20590afd..80c0c19b 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -587,6 +587,220 @@ class ElmhurstSummaryReportExtractor: return output + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the summary report PDF. + This includes Main Property, multiple extensions if they exist, and Room in Roof areas. + """ + data = [] + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to extract each building part, starting from Main Property and including extensions + building_part_pattern = re.compile( + r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", + re.DOTALL + ) + + # Loop through each building part match, including Main Property and extensions + for match in building_part_pattern.finditer(dimensions_text): + part_name = match.group(1) + floor_data = match.group(2) + + # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length + floor_pattern = re.compile( + r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract data for each floor within the building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data list + data.append( + { + "Building Part": part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": 
perimeter, + "Party Wall Length (m)": party_wall_length + } + ) + + # Check specifically for "Room(s) in Roof" entries, which only have Floor Area + room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") + room_in_roof_match = room_in_roof_pattern.search(floor_data) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + data.append( + { + "Building Part": part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + } + ) + + # Calculate aggregated dimensions + main_property = [part for part in data if "Main Property" in part["Building Part"]] + first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] + dimensions = { + "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), + "Total Ground Floor Area (m2)": sum( + [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] + ), + "RIR Floor Area": sum( + [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] + ), + "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if + x["Perimeter (m)"] and x["Room Height (m)"]]), + "First Extension Wall Area (m2)": sum( + [x["Perimeter (m)"] * x["Room Height (m)"] for x in first_extensions if + x["Perimeter (m)"] and x["Room Height (m)"]] + ), + } + + return dimensions + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the 8.0 Roofs section of the summary report. 
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, + # or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append( + { + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + } + ) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative 
wall details within the 7.0 Walls section of the summary PDF text. + """ + # Define data structure to hold all building part wall entries + wall_data = [] + + # Locate the entire 7.0 Walls section + wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) + + # Define pattern to match each building part's wall entry within the section + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)\n" # Matches main wall Type + r"Insulation\s+(.*?)\n" # Matches main wall Insulation + r"(Dry-lining\s+(.*?)\n)?" # Optional main wall Dry-lining + r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown + r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness + re.DOTALL + ) + + # Define pattern to capture alternative wall details, if present + alternative_wall_pattern = re.compile( + r"Alternative Wall Area.*?\n" # Matches start of alternative wall section + r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type + r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation + r"(Alternative Dry-lining\s+(.*?)\n)?" 
# Optional Alternative Dry-lining + r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown + r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness + re.DOTALL + ) + + # Find all building part entries within the 7.0 Walls section + for match in building_part_pattern.finditer(wall_section): + wall_label = match.group(1).strip() + main_wall_type = match.group(2).strip() + main_wall_insulation = match.group(3).strip() + main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" + main_wall_thickness_unknown = match.group(6).strip() + main_wall_thickness = int(match.group(7)) + + # Initialize dictionary for this wall entry + wall_entry = { + "Building Part": wall_label, + "Wall Type": main_wall_type, + "Wall Insulation": main_wall_insulation, + "Wall Dry-lining": main_wall_dry_lining, + "Wall Thickness Unknown": main_wall_thickness_unknown, + "Wall Thickness (mm)": main_wall_thickness, + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Wall Dry-lining": "N/A", + "Alternative Wall Thickness Unknown": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if there's an alternative wall section following this wall entry + alt_match = alternative_wall_pattern.search(wall_section, match.end()) + if alt_match: + wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() + wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip() + wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" + wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() + wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) + + # Append each building part as a dictionary in the wall_data list + wall_data.append(wall_entry) + + return wall_data + def extract(self): """ Extracts specific data from the provided PDF file. 
@@ -687,25 +901,16 @@ class ElmhurstSummaryReportExtractor: data["Primary Heating"] = self.extract_primary_heating(text) data["Secondary Heating"] = self.extract_secondary_heating_details(text) - - # Extract Secondary Heating Section - - # Extract Secondary Heating and Water Heating Codes + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") data["Water Heating Code"] = water_heating_code_match.group(1).strip() - dimensions = extract_building_parts_summary(text) - data.update(dimensions) - - extracted_roof_data = extract_roof_details_summary(text) - main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0] - data["Main Roof Type"] = main_roof_data["Roof Type"] - data["Main Roof Insulation"] = main_roof_data["Roof Insulation"] - data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"] - - walls_data = extract_wall_details_summary(text) # Get the main building wall data main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] data["Main Wall Type"] = main_building_walls["Wall Type"] From 8b875cbccfc2ce5b0f00ed55f4466af7fec165f2 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 09:44:26 +0000 Subject: [PATCH 10/23] done with summary report extraction --- utils/file_data_extraction.py | 75 +++++++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 16 deletions(-) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index 80c0c19b..f5e014a4 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -801,6 +801,64 @@ class ElmhurstSummaryReportExtractor: return wall_data + @staticmethod + def 
extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "5.0 Conservatory" and "7.0 Walls". + + Args: + text (str): The full text of the Summary Report PDF. + + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + # Extract the section between "5.0 Conservatory" and "7.0 Walls" + conservatory_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Is there a conservatory\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + + if not present_match or present_match.group(1).strip().lower() == "no": + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # If we get here, raise a temporary exception since we've not seen a case of this, so should make sure + # this is correct + + separated_match = re.search(r"Is it thermally separated\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + floor_area_match = re.search(r"Floor Area \[m2\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + double_glazed_match = re.search(r"Double Glazed\s*(Yes|No)", conservatory_text, re.IGNORECASE) + glazed_perimeter_match = re.search(r"Glazed Perimeter \[m\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", conservatory_text, re.IGNORECASE) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": 
separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if height_match else "", + } + def extract(self): """ Extracts specific data from the provided PDF file. @@ -810,11 +868,6 @@ class ElmhurstSummaryReportExtractor: - Address """ - # Expected keys: - # dict_keys([ - # 'Primary Heating', 'Secondary Heating', 'Building Parts', 'Roof Details', 'Wall Details', 'Conservatory', - # 'Water Heating Code']) - data = {} with (open(self.file_path, "rb") as file): @@ -904,6 +957,7 @@ class ElmhurstSummaryReportExtractor: data["Building Parts"] = self.extract_building_parts(text) data["Roof Details"] = self.extract_roof_details(text) data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) if not water_heating_code_match: @@ -911,15 +965,4 @@ class ElmhurstSummaryReportExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() - # Get the main building wall data - main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0] - data["Main Wall Type"] = main_building_walls["Wall Type"] - data["Main Wall Insulation"] = main_building_walls["Wall Insulation"] - data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"] - data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"] - data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"] - data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"] - data["Main Building Alternative Wall 
Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"] - data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"] - return data From 5a2ffe646ccecd9559b8c39d211d726ed8c547d8 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 12:00:43 +0000 Subject: [PATCH 11/23] implementing summary report extraction --- etl/lodgement/app.py | 33 ++++++++++++++++- utils/file_data_extraction.py | 69 ++++++++++++++++++++--------------- 2 files changed, 71 insertions(+), 31 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 3688ca19..629c10e0 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -170,8 +170,8 @@ def handler(): epr_to_insert = { "Postcode": extracted_contents["elmhurst epr"]["Postcode"], - "City/County": None, - "District/Town": None, + "City/County": extracted_contents["elmhurst epr"]["County"], + "District/Town": extracted_contents["elmhurst epr"]["Town"], "Local Authority": None, 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"], 'Pre Heat Transfer': pre_heat_transfer, @@ -207,6 +207,35 @@ def handler(): cr_to_insert ) + if extracted_contents.get("elmhurst summary report"): + total_floor_area = sum( + [x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] + + # Get the conservatory floor area + [extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]] + ) + + pre_heat_transfer = ( + extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"] + ) + pre_heat_demand = None # Don't have this + + summary_to_insert = { + "Postcode": extracted_contents["elmhurst summary report"]["Postcode"], + "City/County": extracted_contents["elmhurst summary report"]["County"], + "District/Town": extracted_contents["elmhurst summary report"]["Town"], + 'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary 
report"]["Current SAP Rating"], + 'Pre Heat Transfer': pre_heat_transfer, + 'Pre Total Floor Area': total_floor_area, + 'Pre Heat Demand': pre_heat_demand, + "R. Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"], + "Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"], + } + + update_dictionary_with_check( + output_row_data, + summary_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index f5e014a4..c3cc8a10 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -398,6 +398,15 @@ class ElmhurstEprExtractor: data["Address"] = address_match.group(1).strip() data["Postcode"] = data["Address"].split(",")[-1].strip() + # TODO: + data["Region"] = None + data["House Name"] = None + data["House No"] = None + data["Street"] = None + data["Locality"] = None + data["Town"] = None + data["County"] = None + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) if not sap_match: logger.error("Failed to extract SAP rating.") @@ -657,26 +666,7 @@ class ElmhurstSummaryReportExtractor: } ) - # Calculate aggregated dimensions - main_property = [part for part in data if "Main Property" in part["Building Part"]] - first_extensions = [part for part in data if "1st Extension" in part["Building Part"]] - dimensions = { - "Total Floor Area (m2)": sum([part["Floor Area (m2)"] for part in data]), - "Total Ground Floor Area (m2)": sum( - [part["Floor Area (m2)"] for part in data if "Lowest Floor" in part["Floor Level"]] - ), - "RIR Floor Area": sum( - [part["Floor Area (m2)"] for part in data if "Room in Roof" in part["Floor Level"]] - ), - "Main Building Wall Area (m2)": sum([x["Perimeter (m)"] * x["Room Height (m)"] for x in main_property if - x["Perimeter (m)"] and x["Room Height (m)"]]), - "First Extension Wall Area (m2)": sum( - [x["Perimeter (m)"] * x["Room Height 
(m)"] for x in first_extensions if - x["Perimeter (m)"] and x["Room Height (m)"]] - ), - } - - return dimensions + return data @staticmethod def extract_roof_details(text): @@ -869,7 +859,6 @@ class ElmhurstSummaryReportExtractor: """ data = {} - with (open(self.file_path, "rb") as file): reader = PyPDF2.PdfReader(file) text = "" @@ -885,29 +874,51 @@ class ElmhurstSummaryReportExtractor: # Address and postcode postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + postcode = postcode.group(1).strip() if postcode else "" + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + region = region.group(1).strip() if region else "" + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_name = house_name.group(1).strip() if house_name else "" + house_no = re.search(r"House No:\s*(.*?)\nStreet:", text) + house_no = house_no.group(1).strip() if house_no else "" + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + street = street.group(1).strip() if street else "" + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + locality = locality.group(1).strip() if locality else "" + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + town = town.group(1).strip() if town else "" + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + county = county.group(1).strip() if county else "" # Clean extracted values and remove any prefixes address_parts = [ - house_no.group(1).strip() if house_no else "", - house_name.group(1).strip() if house_name else "", - street.group(1).strip() if street else "", - locality.group(1).strip() if locality else "", - town.group(1).strip() if town else "", - county.group(1).strip() if county else "", - region.group(1).strip() if region else "", - postcode.group(1).strip() if postcode else "" + house_no, + house_name, + street, + locality, + town, + county, + region, + postcode ] # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) data["Postcode"] = 
postcode.group(1).strip() + data["Region"] = region + data["House Name"] = house_name + data["House No"] = house_no + data["Street"] = street + data["Locality"] = locality + data["Town"] = town + data["County"] = county # Extract Current SAP rating sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) From 5e7827f706d9ad6f55f518d5b8dc9acb04ebd50e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 14:02:55 +0000 Subject: [PATCH 12/23] added ocr extraction for permeability report --- etl/lodgement/app.py | 37 ++++++++++-- etl/lodgement/requirements.txt | 3 + utils/file_data_extraction.py | 100 +++++++++++++++++++++++++++++---- 3 files changed, 124 insertions(+), 16 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 629c10e0..6fe9fdc4 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -85,6 +85,10 @@ def handler(): customer_phone = "0345 678 9000" customer_email = "affordablewarmth@shropshire.gov.uk" + # TODO: In order for this to go live, we need to use Poppler, which needs to be installed + # w/ brew install poppler + # We also need to install Tesseract: brew install tesseract + # List the folders in the source data path folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))] @@ -94,25 +98,28 @@ def handler(): "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, + "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor } extracted = [] for property_folder in folders: - coordinator_folder = os.path.join(source_data_path, property_folder, "2. RA Coordinator Info") - # Check if this folder exists - if not os.path.exists(coordinator_folder): - coordinator_folder = os.path.join(source_data_path, property_folder, "1. 
RA Coordinator Info") + property_folder_path = os.path.join(source_data_path, property_folder) + # List the folders in the source data path + subfolders = [ + x for x in os.listdir(property_folder_path) if os.path.isdir(os.path.join(property_folder_path, x)) + ] + coord_folder = os.path.join(property_folder_path, [f for f in subfolders if "RA Coordinator Info" in f][0]) # Get the contents of the folder coordinator_folder_contents = [ - file for file in os.listdir(coordinator_folder) if os.path.isfile(os.path.join(coordinator_folder, file)) + file for file in os.listdir(coord_folder) if os.path.isfile(os.path.join(coord_folder, file)) ] # We detect the various file types extracted_contents = {} for filename in coordinator_folder_contents: - filepath = os.path.join(coordinator_folder, filename) + filepath = os.path.join(coord_folder, filename) if file_extraction_tools.is_pdf(filepath): report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) if report_type is None: @@ -134,6 +141,24 @@ def handler(): extracted_contents[xml_type] = file_extractor(filepath).extract() + att_folder = os.path.join(property_folder_path, [f for f in subfolders if "Air Tightness Tests" in f][0]) + att_folder_contents = [ + file for file in os.listdir(att_folder) if os.path.isfile(os.path.join(att_folder, file)) + ] + + for filename in att_folder_contents: + filepath = os.path.join(att_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type is None: + raise ValueError(f"Unknown report type for {filename}") + file_extractor = extractors[report_type] + + if file_extractor is None: + continue + + extracted_contents[report_type] = file_extractor(filepath).extract() + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 75c63b26..09e475fe 100644 --- 
a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -8,3 +8,6 @@ fuzzywuzzy==0.18.0 python-dotenv python-docx pymupdf +pytesseract +pdf2image +pillow diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index c3cc8a10..c60f01b4 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -3,6 +3,8 @@ import re from collections import Counter from utils.logger import setup_logger from xml.dom.minidom import parseString +from pdf2image import convert_from_path +from pytesseract import image_to_string logger = setup_logger() @@ -41,11 +43,17 @@ def is_elmhurst_evidence_report(text): return text.startswith("RdSAP Evidence Report") +def is_pulse_air_permeability(text): + """ + Determines if the provided text indicates that the PDF is a Pulse Air Permeability Report. + """ + return text.startswith("Air Permeability Test Report @O PULSE") + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
:param pdf_path: String path to the PDF file - :param pdf_file: String name of the PDF file :return: String type of the report ("epr", "summary", or None) """ # Attempt to read the first page of the PDF to determine type @@ -53,14 +61,23 @@ def detect_pdf_report_type(pdf_path): reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" - if is_elmhurst_energy_report(first_page_text): - return "elmhurst epr" - elif is_elmhurst_summary_report(first_page_text): - return "elmhurst summary report" - elif is_osmosis_condition_report(first_page_text): - return "osmosis condition report" - elif is_elmhurst_evidence_report(first_page_text): - return "elmhurst evidence report" + if first_page_text == "": + # Convert PDF pages to images + logger.info("Extracting text from PDF images..., this may take a moment.") + pages = convert_from_path(pdf_path, dpi=300) + if pages: + first_page_text = image_to_string(pages[0]) + + if is_elmhurst_energy_report(first_page_text): + return "elmhurst epr" + elif is_elmhurst_summary_report(first_page_text): + return "elmhurst summary report" + elif is_osmosis_condition_report(first_page_text): + return "osmosis condition report" + elif is_elmhurst_evidence_report(first_page_text): + return "elmhurst evidence report" + elif is_pulse_air_permeability(first_page_text): + return "pulse air permeability" return None @@ -911,7 +928,7 @@ class ElmhurstSummaryReportExtractor: # Join non-empty parts with a comma data["Address"] = ", ".join([part for part in address_parts if part]) - data["Postcode"] = postcode.group(1).strip() + data["Postcode"] = postcode data["Region"] = region data["House Name"] = house_name data["House No"] = house_no @@ -977,3 +994,66 @@ class ElmhurstSummaryReportExtractor: data["Water Heating Code"] = water_heating_code_match.group(1).strip() return data + + +class PulseAirPermeabilityExtractor: + """ + A utility class for extracting specific data from Pulse Air Permeability Test 
Reports. + """ + + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_table(text): + patterns = { + "Air Leakage Rate": r"Air Leakage Rate\s*([\d,@.]+)\s*m/h\s*([\d,@.]+)\s*m3/h", + "Air Permeability": r"Air Permeability\s*([\d,@.]+)\s*=.*?\s*([\d,@.]+)\s*m\?/m\?h", + "Air Changes per Hour": r"Air Changes per Hour\s*([\d,@.]+)\s*([\d,@.]+)", + "Equivalent Leakage Area": r"Equivalent Leakage Area\s*([\d,@.]+)\s*([\d,@.]+)", + "Calculation Uncertainty": r"Calculation Uncertainty\s*([\d,@.]+)\s*([\d,@.]+)", + } + + # Initialize results dictionary + table_data = [] + + # Parse each metric using the corresponding regex + for metric, pattern in patterns.items(): + match = re.search(pattern, text) + if match: + # Extract the two column values + first_value = match.group(1) + second_value = match.group(2) + + # Post-process values: replace '@' with '0' and remove commas + first_value = first_value.replace("@", "0").replace(",", "") + second_value = second_value.replace("@", "0").replace(",", "") + + table_data.append( + { + "Metric": metric, + "Measured @ 4PA": first_value, + "Extrapolated @ 50PA": second_value, + } + ) + else: + raise ValueError(f"Could not extract metric: {metric}") + + return table_data + + def extract(self): + # Extract the pdf using tesseract + logger.info("Extracting data from pdf image - this may take a while...") + pages = convert_from_path(self.file_path, dpi=300) + # Extract all of the pages + text = "" + for page in pages: + text += image_to_string(page) + + # We extract the air permeability reading + results_table = self.extract_table(text) + data = { + "Results Table": results_table + } + + return data From 2cfc88104441c8a2d48015a4b8d3631f9c260259 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 14:08:31 +0000 Subject: [PATCH 13/23] extracted air tightness --- etl/lodgement/app.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/etl/lodgement/app.py 
b/etl/lodgement/app.py index 6fe9fdc4..c75ece4c 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -261,6 +261,15 @@ def handler(): summary_to_insert ) + if extracted_contents.get("pulse air permeability"): + # We extract the AP50 number + results_table = extracted_contents["pulse air permeability"]["Results Table"] + ap50 = [x["Extrapolated @ 50PA"] for x in results_table if x["Metric"] == "Air Permeability"][0] + update_dictionary_with_check( + output_row_data, + {"Pre Air Tightness": ap50} + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) From 3cd9670d1aa49b7b71b9fa59739b82ab2b9e62dd Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 28 Nov 2024 15:53:48 +0000 Subject: [PATCH 14/23] adding file detection for elmhurst project handover --- etl/lodgement/app.py | 29 ++++++++++++++++++++++++++++- utils/file_data_extraction.py | 21 +++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index c75ece4c..2bdeb3d7 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -98,7 +98,8 @@ def handler(): "osmosis condition report": OsmosisConditionReportParser, "elmhurst evidence report": None, "full sap xml": FullSapParser, - "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor + "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, + "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, } extracted = [] @@ -159,6 +160,32 @@ def handler(): extracted_contents[report_type] = file_extractor(filepath).extract() + lodgement_folder = os.path.join( + property_folder_path, [f for f in subfolders if "TrustMark Lodgement" in f][0] + ) + # Within the lodgement folder, we want the required documents sub-folder + lodgement_subfolders = [ + file for file in os.listdir(lodgement_folder) if os.path.isdir(os.path.join(lodgement_folder, file)) + ] + required_documents_folder = 
os.path.join( + lodgement_folder, [f for f in lodgement_subfolders if "required documents" in f.lower()][0] + ) + # List the contents + required_documents_contents = [ + file for file in os.listdir(required_documents_folder) if + os.path.isfile(os.path.join(required_documents_folder, file)) + ] + + # There are only a few file types we actually want to process in here for the moment + for filename in required_documents_contents: + filepath = os.path.join(required_documents_folder, filename) + if file_extraction_tools.is_pdf(filepath): + report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) + if report_type != "elmhurst project handover": + continue + blah + file_extractor = extractors[report_type] + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index c60f01b4..ef02e7f0 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -50,6 +50,13 @@ def is_pulse_air_permeability(text): return text.startswith("Air Permeability Test Report @O PULSE") +def is_elmhurst_project_handover(text): + """ + Determines if the provided text indicates that the PDF is an Elmhurst Project Handover Report. + """ + return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
@@ -78,6 +85,8 @@ def detect_pdf_report_type(pdf_path): return "elmhurst evidence report" elif is_pulse_air_permeability(first_page_text): return "pulse air permeability" + elif is_elmhurst_project_handover(first_page_text): + return "elmhurst project handover" return None @@ -1057,3 +1066,15 @@ class PulseAirPermeabilityExtractor: } return data + + +class ElmhurstProjectHandoverExtractor: + """ + A utility class for extracting specific data from The Elmhurst Project Handover document + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + pass From c6e02836a88cd2a4af7dc8a6ee10e160d6e60f68 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Nov 2024 12:10:29 +0000 Subject: [PATCH 15/23] poc done for now --- etl/lodgement/app.py | 34 +++++++++++++--- etl/lodgement/requirements.txt | 1 + utils/file_data_extraction.py | 72 +++++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 6 deletions(-) diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py index 2bdeb3d7..c1da35dd 100644 --- a/etl/lodgement/app.py +++ b/etl/lodgement/app.py @@ -29,6 +29,7 @@ output_template = { "Tenant - Name": None, "Tenant - Phone": None, "R. Assessor - Name": None, + "R. 
Coordinator - Name": None, "Trustmark Licence Number": None, "Retrofit Assessment Date": None, "Company Name": None, @@ -100,6 +101,7 @@ def handler(): "full sap xml": FullSapParser, "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor, "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor, + "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor, } extracted = [] @@ -183,9 +185,10 @@ def handler(): report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath) if report_type != "elmhurst project handover": continue - blah file_extractor = extractors[report_type] + extracted_contents[report_type] = file_extractor(filepath).extract() + output_row_data = output_template.copy() # dict_keys([ 'City/County', 'District/Town', @@ -193,11 +196,9 @@ def handler(): # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone', # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone', # 'Trustmark Licence Number', - # 'Company Name', 'Retrofit Designer Name', # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat - # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', 'Number of Eligible Measures - # Installed', 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) - # Populate the output row data + # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness', + # 'Total Cost of Works', 'Annual Fuel Saving (MTP)']) update_dictionary_with_check( output_row_data, @@ -297,6 +298,29 @@ def handler(): {"Pre Air Tightness": ap50} ) + if extracted_contents.get("elmhurst project handover"): + handover_to_insert = { + "Number of Eligible Measures Installed": len( + extracted_contents["elmhurst project handover"]["Measures Fitted"] + ), + "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"], + "Company Name": extracted_contents["elmhurst project 
handover"]["Installer Name"], + "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"], + } + update_dictionary_with_check(output_row_data, handover_to_insert) + + if extracted_contents.get("core logic pas assessment report"): + cr_to_insert = { + "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"], + } + update_dictionary_with_check( + output_row_data, + cr_to_insert + ) + extracted.append(output_row_data) extracted_df = pd.DataFrame(extracted) + + extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv", + index=False) diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt index 09e475fe..412aed3b 100644 --- a/etl/lodgement/requirements.txt +++ b/etl/lodgement/requirements.txt @@ -11,3 +11,4 @@ pymupdf pytesseract pdf2image pillow +pdfplumber diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py index ef02e7f0..2e849ef5 100644 --- a/utils/file_data_extraction.py +++ b/utils/file_data_extraction.py @@ -1,5 +1,6 @@ import PyPDF2 import re +import pdfplumber from collections import Counter from utils.logger import setup_logger from xml.dom.minidom import parseString @@ -57,6 +58,13 @@ def is_elmhurst_project_handover(text): return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text +def is_core_logic_pas_assessment_report(text): + """ + Determines if the provided text indicates that the PDF is a PAS Assessment Report. + """ + return text.startswith("Generated Using CoreLogic UK PAS Assessment") + + def detect_pdf_report_type(pdf_path): """ Detects the type of report based on content or filename. 
@@ -87,6 +95,8 @@ def detect_pdf_report_type(pdf_path): return "pulse air permeability" elif is_elmhurst_project_handover(first_page_text): return "elmhurst project handover" + elif is_core_logic_pas_assessment_report(first_page_text): + return "core logic pas assessment report" return None @@ -1077,4 +1087,64 @@ class ElmhurstProjectHandoverExtractor: self.file_path = file_path def extract(self): - pass + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + data = {} + + # Regex patterns + patterns = { + "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)", + "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)", + "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:", + "Designer Name": r"Designer Name\(s\):\s*(.+)", + "Installer Name": r"Installer Name\(s\):\s*(.+)", + } + + # Extract data + for key, pattern in patterns.items(): + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not match {key}") + if match: + if key == "Measures Fitted": + # Special handling for multiline measures + measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1)) + measures = [m.strip() for m in measures] + data[key] = measures + else: + data[key] = match.group(1).strip() if match else "" + + return data + + +class CoreLogicPasAssessmentReportExtractor: + """ + A utility class for extracting specific data from CoreLogic PAS Assessment Reports. 
+ """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + data = {} + + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + tables = page.extract_tables() + if tables: # If tables are detected on the page + for table in tables: + for row in table: + # Check if the row contains "Number of bedrooms" + if any("Number of bedrooms" in str(cell) for cell in row): + # Extract the corresponding value by filtering out None and non-relevant cells + for cell in row: + if cell and cell.strip().isdigit(): # Check if cell contains a numeric value + data["Number of bedrooms"] = int(cell.strip()) + break # Stop further processing once value is found + + return data From c806ef71516d7fda620f854262b7360937b48b10 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 29 Nov 2024 15:12:14 +0000 Subject: [PATCH 16/23] modified the hug postcodes data --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- etl/access_reporting/app.py | 0 etl/customers/gla/hug_postcodes.py | 29 ++++++++++++++++++++++++++++- 4 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 etl/access_reporting/app.py diff --git a/.idea/Model.iml b/.idea/Model.iml index 9b63b142..df6c4faa 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index acd935c1..50cad4ca 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py new file mode 100644 index 00000000..e69de29b diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py index 85783d62..ac2d1e3c 100644 --- a/etl/customers/gla/hug_postcodes.py +++ b/etl/customers/gla/hug_postcodes.py @@ -3,6 +3,7 @@ import pandas as pd from pathlib import Path from tqdm import tqdm from etl.epc.settings import EARLIEST_EPC_DATE +from etl.spatial.OpenUprnClient import OpenUprnClient src_file_path = inspect.getfile(lambda: None) @@ -22,6 +23,7 
@@ for directory in tqdm(epc_directories): data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] data = data[~pd.isnull(data["uprn"])] + data["uprn"] = data["uprn"].astype(int) # Take just the newest EPC per uprn, based on lodgement-date data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") # Take EPC D and below @@ -31,16 +33,41 @@ for directory in tqdm(epc_directories): # Take homes that don't have a gas boiler off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)] + if off_gas.empty: + continue + + # Remove properties with conservation area issues + uprns = off_gas["uprn"].unique() + # Get data + ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") + off_gas = off_gas.merge( + ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename( + columns={"UPRN": "uprn"} + ), + how="left", + on="uprn", + ) + # Remove any restricted units + off_gas = off_gas[ + (off_gas["conservation_status"] != True) + & (off_gas["is_listed_building"] != True) + & (off_gas["is_heritage_building"] != True) + ] + region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") aggregation.append(region_summary) postal_region_aggregation = pd.concat(aggregation) +# Re-aggregate +postal_region_aggregation = postal_region_aggregation.groupby("postal_region")["count"].sum().reset_index() + postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False) postal_region_aggregation = postal_region_aggregation.rename( columns={"postal_region": "Postcode Region", "count": "Number of Homes"} ) postal_region_aggregation.to_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions - without conservation " + "area.xlsx", index=False ) From 3e8a1bc4fdf54d9647bafc90806ba9e7f731e69e Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar Date: Mon, 2 Dec 2024 17:50:08 +0000 Subject: [PATCH 17/23] set up cottons asset list --- etl/access_reporting/app.py | 394 ++++++++++++++++++++ etl/access_reporting/requirements.txt | 11 + etl/customers/cottons/remote_assessments.py | 102 +++++ etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 33 +- 5 files changed, 540 insertions(+), 1 deletion(-) create mode 100644 etl/access_reporting/requirements.txt create mode 100644 etl/customers/cottons/remote_assessments.py diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py index e69de29b..830f4370 100644 --- a/etl/access_reporting/app.py +++ b/etl/access_reporting/app.py @@ -0,0 +1,394 @@ +import os +from msal import ConfidentialClientApplication +from datetime import datetime, timedelta +import requests +from functools import wraps +import time +import logging +from io import BytesIO +import pandas as pd + +# Configure logging +logger = logging.getLogger(__name__) +if not logger.handlers: + handler = logging.StreamHandler() + formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + handler.setFormatter(formatter) + logger.addHandler(handler) +logger.setLevel(logging.INFO) + + +def handle_error(response): + """ + Handle errors based on HTTP status codes and log detailed information. + """ + try: + error_json = response.json().get('error', {}) + except ValueError: + error_json = {} + + error_code = error_json.get('code', 'unknownError') + error_message = error_json.get('message', 'No detailed error message provided.') + inner_error = error_json.get('innererror', {}) + details = error_json.get('details', []) + + logger.error(f"Error Code: {error_code}") + logger.error(f"Error Message: {error_message}") + if inner_error: + logger.error(f"Inner Error: {inner_error}") + if details: + logger.error(f"Error Details: {details}") + + if response.status_code == 401: + logger.error("Unauthorized. 
Token might be invalid.") + elif response.status_code == 403: + logger.error("Forbidden. Access denied to the requested resource.") + elif response.status_code == 404: + logger.error("Not Found. The requested resource doesn’t exist.") + elif response.status_code == 429: + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + elif response.status_code in (500, 503): + retry_after = int(response.headers.get('Retry-After', 5)) # Default to 5 seconds if not provided + logger.error(f"Server error. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return 'retry' + else: + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + raise ValueError(f"API request failed with status code {response.status_code} - {error_message}") + + +def api_call_decorator(func): + """ + Handles various aspects of the API call, including refreshing the access token if needed and handling pagination. + :param func: The function to be decorated. + :return: The wrapped function. 
+ """ + + @wraps(func) + def wrapper(self, *args, **kwargs): + try: + # Check and refresh the access token if needed + if self.is_access_token_expired(): + self.retrieve_access_token() + logger.info("Access token refreshed.") + + # Get the HTTP method, URL, and optionally data from the function + http_method, url, data = func(self, *args, **kwargs) + + # Initialize the results list and handle pagination if page_size is provided + results = [] + page_size = kwargs.get('page_size', None) + response_data = {} + + while url: + response = requests.request(http_method, url, headers=self.headers, json=data) + + # Handle the response + if response.status_code == 200: + response_json = response.json() # Store the response JSON + if page_size: + results.extend(response_json.get('value', [])) + url = response_json.get('@odata.nextLink', None) + else: + response_data = response_json # Capture the full response for consistency + break + else: + retry = handle_error(response) + if retry == 'retry': + continue + + if page_size: + response_data = {'value': results} + + return response_data + + except Exception as e: + logger.exception("An error occurred during the API call.") + raise e + + return wrapper + + +class SharePointClient: + access_token = None + access_token_request_timestamp = None + access_token_expiry = None + headers = None + + TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ" + + def __init__(self, tenant_id, client_id, client_secret, site_id, access_token=None, + access_token_expiration_details=None): + """ + Initializes the SharePointClient with necessary credentials and site information. + :param tenant_id: The tenant ID. + :param client_id: The client ID. + :param client_secret: The client secret. + :param site_id: The site ID. 
+ :param access_token: The access token (optional) + :param access_token_expiration_details: The access token expiration details (optional) + """ + self.tenant_id = tenant_id + self.client_id = client_id + self.client_secret = client_secret + + if access_token: + if not access_token_expiration_details: + raise ValueError("Access token expiration details must be provided.") + self.access_token = access_token + self.set_access_token_expiration_details(access_token_expiration_details) + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + else: + self.retrieve_access_token() + + # Retrieve static identifiers + self.site_id = site_id + self.document_drive = self.get_documents_drive() + + def get_token_expiration_details(self): + """ + Returns the access token expiration details. Converts the datetime objects to strings for serialization. + :return: + """ + return { + 'access_token_request_timestamp': datetime.strftime( + self.access_token_request_timestamp, self.TIMESTAMP_FORMAT + ), + 'access_token_expiry': datetime.strftime(self.access_token_expiry, self.TIMESTAMP_FORMAT) + } + + def set_access_token_expiration_details(self, access_token_expiration_details): + """ + Sets the access token expiration details from a serialized dictionary. + :param access_token_expiration_details: The serialized access token expiration details. + :return: + """ + self.access_token_request_timestamp = datetime.strptime( + access_token_expiration_details['access_token_request_timestamp'], self.TIMESTAMP_FORMAT + ) + self.access_token_expiry = datetime.strptime( + access_token_expiration_details['access_token_expiry'], self.TIMESTAMP_FORMAT + ) + + def is_access_token_expired(self): + """ + Checks if the access token has expired. If it has, a new access token is retrieved. + :return: True if expired, False otherwise. 
+ """ + return datetime.now() >= self.access_token_expiry + + def retrieve_access_token(self, refresh=False): + """ + Implements authentication using MSAL. + :param refresh: If True, force a refresh of the access token. + :return: None + """ + app = ConfidentialClientApplication( + self.client_id, + authority=f"https://login.microsoftonline.com/{self.tenant_id}", + client_credential=self.client_secret + ) + + scope = ["https://graph.microsoft.com/.default"] + + access_token_request_timestamp = datetime.now() + + if refresh: + logger.info("Forcing refresh of access token.") + token = app.acquire_token_for_client(scopes=scope) + else: + # Check if a token is already cached + token = app.acquire_token_silent(scope, account=None) + + if not token: + token = app.acquire_token_for_client(scopes=scope) + + if "access_token" not in token: + logger.error("Authentication failed.") + raise ValueError("Authentication failed") + + access_token_expiry = access_token_request_timestamp + timedelta( + seconds=token['expires_in'] - 20 + ) + + self.access_token = token + self.access_token_request_timestamp = access_token_request_timestamp + self.access_token_expiry = access_token_expiry + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + + logger.info("Access token retrieved successfully.") + + @api_call_decorator + def get_documents_drive(self): + """ + Get the document drive of the SharePoint site. + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive" + logger.info(f"Getting document drive from URL: {url}") + return 'GET', url, None + + @api_call_decorator + def list_folder_contents(self, drive_id, folder_path: str, page_size: int = 100): + """ + This function will list the contents of a folder in SharePoint. + :param drive_id: The ID of the drive. + :param folder_path: The path of the folder. 
+ :param page_size: The number of items per page (default is 100). + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{folder_path}:/children?$top={page_size}" + logger.info(f"Listing folder contents from URL: {url}") + return 'GET', url, None + + @staticmethod + def download_sharepoint_file(download_url): + """ + Downloads a file from the given URL and returns its content. + + :param download_url: The URL to download the file from. + :return: The content of the downloaded file. + """ + response = requests.get(download_url, stream=True) + response.raise_for_status() # Check if the request was successful + + file_content = BytesIO() + + # Read the file content into memory + for chunk in response.iter_content(chunk_size=8192): + file_content.write(chunk) + + file_content.seek(0) # Reset the file pointer to the beginning + + return file_content + + +def app(): + # Customers for WC 18/11/2024 + # + # ----- Eastlight location ----- + # No data this week, low on data + # Housing Associations/Eastlight/Survey Outcomes/ + # + # ----- Settle location ----- + # No data this week, in separate files + # Housing Associations/Settle/Survey Outcomes/ + # + # ----- Community Housing ----- + # In separate files - will we get to a singular form? 
+ # Housing Associations/Community Housing/Survey Outcomes/ + # + # ----- ACIS location ----- + # Doesn't have this week's data + # Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx + # + # ----- Southern location ----- + # + # + # ------ Unitas location ------ + # Does have this week's data + # Unitas location: Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx + + locations = { + "Unitas": "Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx", + "Eastlight": "Housing Associations/Eastlight/Survey Outcomes/", + "Settle": "Housing Associations/Settle/Survey Outcomes/", + "Community Housing": "Housing Associations/Community Housing/Survey Outcomes/", + "ACIS": "Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx", + "Southern": None, + } + + SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None) + SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) + WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None) + + sharepoint_client = SharePointClient( + tenant_id=SHAREPOINT_TENANT_ID, + client_id=SHAREPOINT_CLIENT_ID, + client_secret=SHAREPOINT_CLIENT_SECRET, + site_id=WARMFRONT_SHAREPOINT_SITE_ID + ) + + results = [] + for customer, location in locations.items(): + if location is None: + continue + + if location.endswith(".xlsx"): + # Read in the file + # List the contents of the folder + location_folder = os.path.dirname(location) + contents = sharepoint_client.list_folder_contents( + drive_id=sharepoint_client.document_drive["id"], + folder_path=location_folder + ) + filepaths = contents["value"] + + download_url = next( + (file['@microsoft.graph.downloadUrl'] for file in filepaths + if '@microsoft.graph.downloadUrl' in file and file['name'] == os.path.basename(location)), + None + ) + + if download_url is None: + raise ValueError("File not found in the SharePoint folder.") + + 
file_content = sharepoint_client.download_sharepoint_file(download_url) + + # Convert to pandas dataframe since file is an excel file + df = pd.read_excel(file_content) + df["Outcome"] = df["Outcome"].str.strip().str.lower() + + # We cannot group by funding type accurately because any job that is not funded will have a NaN value + # and therefore we have a 100% access rate for funded jobs and 0% otherwise + surveyor_outcomes = [] + for (week, surveyor, funding), group in df.groupby(["Week Commencing", "DEA/REA"]): + funding_type = [x for x in group["Funding Type"].unique() if not pd.isnull(x)] + if funding_type: + funding_type = " + ".join(funding_type) + else: + funding_type = "No Funding" + surveyed = group[group["Outcome"] == "surveyed"] + no_answer = group[ + group["Outcome"] == "no answer" + ] + other_issue = group[~group["Outcome"].isin(["surveyed", "no answer"])] + + surveyor_outcomes.append( + { + "Surveyor": surveyor, + "Week": week, + "Funding": funding_type, + "Surveyed": surveyed.shape[0], + "No Answer": no_answer.shape[0], + "Other Issue": other_issue.shape[0], + } + ) + + surveyor_outcomes = pd.DataFrame(surveyor_outcomes) + surveyor_outcomes["Week"] = pd.to_datetime(surveyor_outcomes["Week"]) + + weekly_access = ( + surveyor_outcomes.drop(columns=["Surveyor"]).groupby(["Week", "Funding"]).sum().reset_index() + ) + # Sort by week and surveyor ascending + surveyor_outcomes = surveyor_outcomes.sort_values(["Week", "Surveyor"], ascending=[True, True]) + surveyor_outcomes["Access Rate"] = 100 * surveyor_outcomes["Surveyed"] / ( + surveyor_outcomes["Surveyed"] + surveyor_outcomes["No Answer"] + surveyor_outcomes["Other Issue"] + ) + + weekly_access["Total"] = ( + weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"] + ) + weekly_access["Access Rate"] = 100 * weekly_access["Surveyed"] / ( + weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"] + ) diff --git
a/etl/access_reporting/requirements.txt b/etl/access_reporting/requirements.txt new file mode 100644 index 00000000..8e6dbb08 --- /dev/null +++ b/etl/access_reporting/requirements.txt @@ -0,0 +1,11 @@ +python-docx==0.8.11 +PyPDF2==3.0.1 +boto3 +requests +pandas +pyarrow==12.0.1 +openpyxl==3.1.2 +usaddress==0.5.10 +pdfplumber==0.10.3 +msgpack==1.0.5 +msal \ No newline at end of file diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py new file mode 100644 index 00000000..fe195f7d --- /dev/null +++ b/etl/customers/cottons/remote_assessments.py @@ -0,0 +1,102 @@ +import os +import time + +from tqdm import tqdm +import pandas as pd +from dotenv import load_dotenv +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.s3 import save_csv_to_s3 + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +PORTFOLIO_ID = 121 +USER_ID = 8 + + +def app(): + """ + Prepares the inputs to produce the remote assessments for Cottons + :return: + """ + + # Read in the asset list + cottons_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull.xlsx" + ) + # A number are missing EPCs due to the space in the postcode + # Breakdowns: + # C 119 + # D 106 + # E 26 + # B 5 + # + # Take the EPC D/E properties + asset_list = cottons_asset_list[ + cottons_asset_list["EPC rating on register"].isin(["D", "E"]) + ] + asset_list = asset_list.reset_index(drop=True) + asset_list["row_id"] = asset_list.index + asset_list["uprn"] = asset_list["uprn"].astype(int) + + extracted_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + add1 = home["address1"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key="" + ) + 
epc_searcher.find_property(skip_os=True) + + find_epc_searcher = RetrieveFindMyEpc(address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"]) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + } + print(body) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index b6394275..4db72b23 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -295,6 +295,7 @@ class RetrieveFindMyEpc: "Change room heaters to condensing boiler": ["boiler_upgrade"], "Cylinder thermostat": ["cylinder_thermostat"], "Heat recovery system for mixer showers": ["heat_recovery_shower"], + "Room-in-roof insulation": ["room_in_roof_insulation"], } survey = True diff --git a/etl/route_march_data_pull/app.py 
b/etl/route_march_data_pull/app.py index b53b36c2..0f3e0068 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -30,9 +30,12 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): postcode = home[postcode_column] house_number = home[address1_column] full_address = home[fulladdress_column] + house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) + if house_no is None: + house_no = house_number searcher = SearchEpc( - address1=str(house_number), + address1=str(house_no), postcode=postcode, auth_token=EPC_AUTH_TOKEN, os_api_key="", @@ -46,6 +49,34 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.ordnance_survey_client.built_form = None searcher.find_property(skip_os=True) + + # Check if we have a flat or apartment + if searcher.newest_epc is None: + # Try again: + if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: + # Backup + add1 = full_address.split(",")[1].strip() + else: + add1 = str(house_number) + searcher = SearchEpc( + address1=add1, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + property_type=None, + fast=True, + full_address=full_address, + max_retries=5 + ) + + if ( + "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in + house_number.lower() + ): + searcher.ordnance_survey_client.property_type = "Flat" + + searcher.find_property(skip_os=True) + if searcher.newest_epc is None: no_epc.append(home["row_id"]) continue From 477504abd136c17aaca4ba0ab8757d59bdf84e0a Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 3 Dec 2024 19:08:18 +0000 Subject: [PATCH 18/23] adding non-intrusive sap points and survey flag pick up for multiple recommendations --- backend/Property.py | 12 ++++++++++++ backend/app/assumptions.py | 1 + etl/customers/cottons/remote_assessments.py | 11 ++++++++++- recommendations/DraughtProofingRecommendations.py |
5 ++++- recommendations/HeatingRecommender.py | 13 ++++++++++++- recommendations/HotwaterRecommendations.py | 9 +++++---- recommendations/RoofRecommendations.py | 10 ++++++++-- recommendations/WallRecommendations.py | 10 ++++++++-- 8 files changed, 60 insertions(+), 11 deletions(-) diff --git a/backend/Property.py b/backend/Property.py index 31f207ab..cc5bf12b 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -426,6 +426,18 @@ class Property: if phase_epc_transformation[k] == v: continue + if k == "hotwater-description": + if ( + v == "From main system" + ) and ( + phase_epc_transformation["mainheat-description"] == "Electric storage heaters" + ) and ( + "Electric immersion" in phase_epc_transformation["hotwater-description"] + ): + # It means we've recommended HHR with electric immersion, and shouldn't overwrite + # the hot water description + continue + raise NotImplementedError( "Already have this key in the phase_epc_transformation - implement me" ) diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 79f2a087..44838a47 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -50,4 +50,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = { }, "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, + "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, } diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py index fe195f7d..6ac895f1 100644 --- a/etl/customers/cottons/remote_assessments.py +++ b/etl/customers/cottons/remote_assessments.py @@ -40,6 +40,7 @@ def app(): asset_list["uprn"] = asset_list["uprn"].astype(int) extracted_data = [] + model_asset_list = [] for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): add1 = home["address1"] pc = home["postcode"] @@ -63,6 +64,14 @@ def app(): } ) + model_asset_list.append( + { + "uprn": home["uprn"], + "address": 
epc_searcher.newest_epc["address1"], + "postcode": epc_searcher.newest_epc["postcode"], + } + ) + non_invasive_recommendations = [ { "uprn": r["uprn"], @@ -72,7 +81,7 @@ def app(): filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" save_csv_to_s3( - dataframe=pd.DataFrame(asset_list), + dataframe=pd.DataFrame(model_asset_list), bucket_name="retrofit-plan-inputs-dev", file_name=filename ) diff --git a/recommendations/DraughtProofingRecommendations.py b/recommendations/DraughtProofingRecommendations.py index 4bd85a03..a16a94f6 100644 --- a/recommendations/DraughtProofingRecommendations.py +++ b/recommendations/DraughtProofingRecommendations.py @@ -26,6 +26,9 @@ class DraughtProofingRecommendations: if not draught_proofing_recommendation_config: return + # Cost is based on a £50 cost per window, based on Checkatrade + cost = draught_proofing_recommendation_config.get("cost", self.property.number_of_windows * 50) + description = ( "Draught proof doors and windows to improve energy efficiency" if not draught_proofing_recommendation_config.get("description") @@ -48,7 +51,7 @@ class DraughtProofingRecommendations: "kwh_savings": 0, "co2_equivalent_savings": 0, "energy_cost_savings": 0, - "total": draught_proofing_recommendation_config["cost"], + "total": cost, # We use a very simple and rough estimate of 4 hours per unit "labour_hours": draught_proofing_recommendation_config.get("labour_hours", 8), "labour_days": draught_proofing_recommendation_config.get("labour_days", 1), # Assume 8 hour day diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index 7dc4f8b2..a4443bad 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -1,5 +1,6 @@ import re import backend.app.assumptions as assumptions +from etl.customers.immo.pilot.asset_list import non_invasive_recommendations from recommendations.Costs import Costs, BOILER_UPGRADE_SCHEME_ASHP_VALUE from recommendations.recommendation_utils import 
( check_simulation_difference, override_costs, combine_recommendation_configs @@ -981,6 +982,10 @@ class HeatingRecommender: self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] ) + non_invasive_recommendation = next(( + r for r in self.property.non_invasive_recommendations if r["type"] == "boiler_upgrade" + ), {}) + if has_inefficient_space_heating or has_inefficient_water: boiler_size = self.estimate_boiler_size( property_type=self.property.data["property-type"], @@ -1079,12 +1084,13 @@ class HeatingRecommender: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": description_simulation, **boiler_costs, "system_type": "boiler_upgrade", + "survey": non_invasive_recommendation.get("survey", None) } # We recommend the heating controls @@ -1105,6 +1111,11 @@ class HeatingRecommender: if not controls_recommender.recommendation and not boiler_recommendation: return + # If this is true, we set SAP points to None and survey to False for the boiler recommendation + if boiler_recommendation: + boiler_recommendation["sap_points"] = None + boiler_recommendation["survey"] = False + if not system_change and len(boiler_recommendation): # If there is not a system change, we add the boiler recommendation at point. 
self.heating_recommendations.extend([boiler_recommendation]) diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index b86329e4..d8404cc1 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -20,6 +20,8 @@ class HotwaterRecommendations: :return: """ # Reset the recommendations + recommendations_phase = phase + self.recommendations = [] non_invasive_recommendations = self.property.non_invasive_recommendations if non_invasive_recommendations: @@ -28,7 +30,6 @@ class HotwaterRecommendations: r["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"] ] - recommendations_phase = phase for m in measures: non_invasive_rec = [ r for r in non_invasive_recommendations if r["type"] == m @@ -55,7 +56,7 @@ class HotwaterRecommendations: if self.property.hotwater["clean_description"] == "Gas boiler/circulator, no cylinder thermostat": # Handle this case specifically: - self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=phase) + self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=recommendations_phase) return # If there is no system present, but access to the mains, we @@ -68,14 +69,14 @@ class HotwaterRecommendations: (self.property.hotwater["no_system_present"] is None) & (len(has_tank_recommendation) == 0) ): - self.recommend_tank_insulation(phase=phase) + self.recommend_tank_insulation(phase=recommendations_phase) return has_cylinder_recommendation = [r for r in self.recommendations if r["type"] == "cylinder_thermostat"] if ((self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat") & (len(has_cylinder_recommendation) == 0)): - self.recommend_cylinder_thermostat(phase=phase) + self.recommend_cylinder_thermostat(phase=recommendations_phase) return def recommend_tank_insulation(self, phase, sap_points=None, survey=False, _return=False): diff --git a/recommendations/RoofRecommendations.py 
b/recommendations/RoofRecommendations.py index 51264b75..4e29083f 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -290,6 +290,11 @@ class RoofRecommendations: insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None recommendations = [] for _, insulation_material_group in insulation_materials.groupby("description"): @@ -429,14 +434,15 @@ class RoofRecommendations: "description": self.make_roof_insulation_description(material), "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", 0), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "roof-description": new_description, "roof-energy-eff": new_efficiency }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index f77ae5a0..92147fb8 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -385,6 +385,11 @@ class WallRecommendations(Definitions): if insulation_thickness == "below average": cavity_width = cavity_width * (1 - PARTIALLY_FILLED_PERCENTAGE_ASSUMPTION) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + # Test the different fill options lowest_selected_u_value = None recommendations = [] @@ -475,14 +480,15 @@ class WallRecommendations(Definitions): "description": description, "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", None), 
"already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "walls-description": "Cavity wall, filled cavity", "walls-energy-eff": "Good" }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) From 703c4e3ac1deacbb48ec1a2432c1a0c0b631c980 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Dec 2024 13:45:19 +0000 Subject: [PATCH 19/23] adding additional coverage to RetrieveFindMyEpc --- .idea/Model.iml | 2 +- .idea/misc.xml | 2 +- backend/app/plan/router.py | 3 +- etl/customers/cottons/remote_assessments.py | 19 +++++++++++-- etl/find_my_epc/RetrieveFindMyEpc.py | 14 +++++++++- etl/route_march_data_pull/app.py | 31 +++++++++++++-------- recommendations/HeatingRecommender.py | 7 ++--- recommendations/Recommendations.py | 12 ++++++-- 8 files changed, 65 insertions(+), 25 deletions(-) diff --git a/.idea/Model.iml b/.idea/Model.iml index df6c4faa..762580d9 100644 --- a/.idea/Model.iml +++ b/.idea/Model.iml @@ -7,7 +7,7 @@ - + diff --git a/.idea/misc.xml b/.idea/misc.xml index 50cad4ca..c916a158 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -3,7 +3,7 @@ - + diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 4a5b3bd4..dbef6435 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -366,7 +366,7 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) property_valution = next(( - float(x["value"]) for x in valuation_data if + float(x["valuation"]) for x in valuation_data if (str(x["uprn"]) == str(uprn)) ), None) @@ -611,6 +611,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_instance=property_instance, all_predictions=all_predictions, recommendations=recommendations, + representative_recommendations=representative_recommendations ) ) diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py index 
6ac895f1..7855a1a9 100644 --- a/etl/customers/cottons/remote_assessments.py +++ b/etl/customers/cottons/remote_assessments.py @@ -10,8 +10,8 @@ from utils.s3 import save_csv_to_s3 load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -PORTFOLIO_ID = 121 USER_ID = 8 +PORTFOLIO_ID = 121 def app(): @@ -22,7 +22,8 @@ def app(): # Read in the asset list cottons_asset_list = pd.read_excel( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull.xlsx" + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull with " + "valuations.xlsx" ) # A number are missing EPCs due to the space in the postcode # Breakdowns: @@ -79,6 +80,9 @@ def app(): } for r in extracted_data ] + valuations_data = asset_list[["uprn", "Zoopla Valuation"]].copy().rename(columns={"Zoopla Valuation": "valuation"}) + valuations_data = valuations_data[~pd.isnull(valuations_data["valuation"])] + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" save_csv_to_s3( dataframe=pd.DataFrame(model_asset_list), @@ -94,6 +98,14 @@ def app(): file_name=non_invasive_recommendations_filename ) + # Store the valuations data in s3 + valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + save_csv_to_s3( + dataframe=valuations_data, + bucket_name="retrofit-plan-inputs-dev", + file_name=valuations_filename + ) + body = { "portfolio_id": str(PORTFOLIO_ID), "housing_type": "Social", @@ -103,9 +115,10 @@ def app(): "already_installed_file_path": "", "patches_file_path": "", "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, - "valuation_file_path": "", + "valuation_file_path": valuations_filename, "scenario_name": "Wave 3 Packages", "multi_plan": True, "budget": None, + "exclusions": ['air_source_heat_pump', 'boiler_upgrade', 'floor_insulation'] } print(body) diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 4db72b23..3dd486b3 
100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -282,7 +282,8 @@ class RetrieveFindMyEpc: "Low energy lighting for all fixed outlets": ["low_energy_lighting"], "Cylinder thermostat recommendation": [], "Heating controls recommendation": [], - "Replace boiler with Band A condensing boiler": [], + "Replace boiler with Band A condensing boiler": ["boiler_upgrade"], + "Band A condensing gas boiler": ["boiler_upgrade"], "Solar panel recommendation": [], "Double glazing recommendation": [], "Solid wall insulation recommendation": [], @@ -296,6 +297,17 @@ class RetrieveFindMyEpc: "Cylinder thermostat": ["cylinder_thermostat"], "Heat recovery system for mixer showers": ["heat_recovery_shower"], "Room-in-roof insulation": ["room_in_roof_insulation"], + "Fan assisted storage heaters": [], + "Fan-assisted storage heaters": [], + "Step 1:": [], + "Biomass stove with boiler": [], + "Replace boiler with biomass boiler": [], + "Heating controls (room thermostat and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Heating controls (programmer, and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 0f3e0068..11dd19b8 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -25,6 +25,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): epc_data = [] errors = [] no_epc = [] + # home = asset_list[asset_list["row_id"] == errors[15]].squeeze() for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] @@ -94,7 +95,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): ) find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() except ValueError as e: - if "No EPC found" in str(e): + if 
"No EPC found" in str(e) and "address1" in searcher.newest_epc: find_epc_searcher = RetrieveFindMyEpc( address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] ) @@ -151,17 +152,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/" - DATA_FILENAME = "Cottons Asset List.xlsx" - SHEET_NAME = "Sheet1" - POSTCODE_COLUMN = "postcode" - FULLADDRESS_COLUMN = "Property Address" - ADDRESS1_COLUMN = "address1" - ADDRESS1_METHOD = None - ADDRESS_COLS_TO_CONCAT = [] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford" + DATA_FILENAME = "BROMFORD - SOLAR PV ROOFs INSPECTED - Electric only properties getting to C list.xlsx" + SHEET_NAME = "MAIN" + POSTCODE_COLUMN = "Post Code" + FULLADDRESS_COLUMN = "Full Address" + ADDRESS1_COLUMN = None + ADDRESS1_METHOD = "first_two_words" + ADDRESS_COLS_TO_CONCAT = ["House No", "Street", "District"] asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() + asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces @@ -249,6 +250,8 @@ def app(): [ "row_id", "uprn", + "address1", + "postcode", "property-type", "built-form", "inspection-date", @@ -256,6 +259,7 @@ def app(): "current-energy-efficiency", "roof-description", "walls-description", + "floor-description", "transaction-type", # New fields needed "secondheat-description", @@ -268,7 +272,7 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ] + ].rename(columns={"address1": "Address1 on EPC", "postcode": "Postcode on EPC"}) asset_list = asset_list.merge( epc_df, @@ -308,6 +312,7 @@ def app(): "number-habitable-rooms": "Number of Habitable Rooms", "walls-description": "Wall Construction", 
"roof-description": "Roof Construction", + "floor-description": "Floor Construction", "mainheat-description": "Heating Type", "secondheat-description": "Secondary Heating", "transaction-type": "Reason for last EPC", @@ -363,3 +368,7 @@ def app(): # Store as an excel filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" asset_list.to_excel(filename, index=False) + + matches_review = asset_list[ + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address1 on EPC", "Postcode on EPC"] + ] diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index a4443bad..1eab7d42 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -1111,17 +1111,14 @@ class HeatingRecommender: if not controls_recommender.recommendation and not boiler_recommendation: return - # If this is true, we set SAP points to None and survey to False for the boiler recommendation - if boiler_recommendation: - boiler_recommendation["sap_points"] = None - boiler_recommendation["survey"] = False - if not system_change and len(boiler_recommendation): # If there is not a system change, we add the boiler recommendation at point. 
self.heating_recommendations.extend([boiler_recommendation]) if system_change: # We combine the heating and controls recommendations, in the case of a system change + # If this is true, we set SAP points to None and survey to False for the boiler recommendation + combined_recommendations = [] for controls_recommendation in controls_recommender.recommendation: combined_recommendation = self.combine_heating_and_controls( diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index ed6a8526..189581d8 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -311,7 +311,7 @@ class Recommendations: continue has_u_value = recommendations_by_type[0].get("new_u_value") is not None - has_sap_points = recommendations_by_type[0].get("sap_points") is not None + has_sap_points = all([r.get("sap_points") is not None for r in recommendations_by_type]) has_rank = recommendations_by_type[0].get("rank") is not None # When check if these recommendations have two different types, such as solid wall insulation @@ -449,6 +449,7 @@ class Recommendations: property_instance, all_predictions, recommendations, + representative_recommendations, ): """ @@ -473,6 +474,9 @@ class Recommendations: property_recommendations = recommendations[property_instance.id].copy() + representative_recs = representative_recommendations[property_instance.id].copy() + representative_ids = [r["recommendation_id"] for r in representative_recs] + increasing_variables = ["sap"] decreasing_variables = ["carbon", "heat_demand"] @@ -530,7 +534,9 @@ class Recommendations: else: - previous_phase_values_multiple = [x for x in impact_summary if x["phase"] == (rec["phase"] - 1)] + previous_phase_values_multiple = [ + x for x in impact_summary if x["phase"] == (rec["phase"] - 1) and x["representative"] + ] if len(previous_phase_values_multiple) != 1: # Take an average of each of the previous phases keys_to_median = ["sap", "carbon", "heat_demand"] @@ -628,7 
+634,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], **current_phase_values } ) From c41891f0faaaf19753d6f5a2279918718ad34e29 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 5 Dec 2024 14:20:43 +0000 Subject: [PATCH 20/23] adding manual uprn override --- etl/route_march_data_pull/app.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 11dd19b8..3c8cfa31 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -21,7 +21,7 @@ load_dotenv(dotenv_path="backend/.env") EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") -def get_data(asset_list, fulladdress_column, address1_column, postcode_column): +def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map): epc_data = [] errors = [] no_epc = [] @@ -34,6 +34,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) if house_no is None: house_no = house_number + uprn = manual_uprn_map.get(full_address, None) searcher = SearchEpc( address1=str(house_no), @@ -43,7 +44,8 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): property_type=None, fast=True, full_address=full_address, - max_retries=5 + max_retries=5, + uprn=uprn ) # Force the skipping of estimating the EPC searcher.ordnance_survey_client.property_type = None @@ -52,7 +54,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column): searcher.find_property(skip_os=True) # Check if we have a flat or appartment - if searcher.newest_epc is None: + if searcher.newest_epc is None and uprn is None: # Try again: if 
SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: # Backup @@ -161,6 +163,13 @@ def app(): ADDRESS1_METHOD = "first_two_words" ADDRESS_COLS_TO_CONCAT = ["House No", "Street", "District"] + # Maps addresses to uprn in problematic cases + MANUAL_UPRN_MAP = { + "1 Ivy Court, The Gardens, Erdington, Birmingham": 100071442178, + "8 Ivy Court, The Gardens, Erdington, Birmingham": 10033393299, + "7 Ivy Court, The Gardens, Erdington, Birmingham": 100071442184, + } + asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index @@ -193,7 +202,8 @@ def app(): asset_list=asset_list, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP ) # We now retrieve any failed properties From 3c98cfa7cc31fa4db43a318726afe85d830775e7 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Dec 2024 17:02:59 +0000 Subject: [PATCH 21/23] reviewing stonewater assigned packages --- etl/customers/gla/hug_postcodes.py | 4 + .../stonewater/Wave 3 Preparation.py | 125 ++++++++++++++++++ .../whlg eligibile properties.py | 77 +++++++++++ etl/find_my_epc/RetrieveFindMyEpc.py | 1 + etl/route_march_data_pull/app.py | 35 +++-- 5 files changed, 224 insertions(+), 18 deletions(-) create mode 100644 etl/customers/waltham_forest/whlg eligibile properties.py diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py index ac2d1e3c..fc89b6f2 100644 --- a/etl/customers/gla/hug_postcodes.py +++ b/etl/customers/gla/hug_postcodes.py @@ -54,6 +54,10 @@ for directory in tqdm(epc_directories): & (off_gas["is_heritage_building"] != True) ] + off_gas = off_gas[ + off_gas["tenure"].isin(["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"]) + ] + 
region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") aggregation.append(region_summary) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 77200e69..bd36d782 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2607,5 +2607,130 @@ def propsed_wave_3_sample(): len({v for v in units_in_bid if str(v) in u_aids}) len(list(set(units_in_bid))) + +def identify_incorrect_pacakges(): + """ + Due to limitations in the data collected during survey, we have some properties that do not have suitable packages + assigned. This function will identify those properties, which can be flagged for Stonewater's review + """ + + units_with_assigned_packages = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.20 V2.xlsx"), + header=2, + sheet_name="Individual Units Programme" + ) + + # This sheet contains information on the heating systems for properties, so we can flag any units that have + # been labelled as being electric but are actually gas + heating_survey_data = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "STOCKBOOK December 2024 data (5).xlsx"), + header=0, + sheet_name="Export" + ) + + units_with_assigned_packages = units_with_assigned_packages.merge( + heating_survey_data[["Asset Reference", "Heating Type"]], how="left", + left_on="Org. 
ref.", right_on="Asset Reference" + ) + + # Check the different heating types + units_with_assigned_packages["Gas properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"])) & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat " + "pumps: Air source heat pump " + "with flow temperature <= 35°C", + "Electric Storage Systems: Fan " + "storage heaters", + "Electric (direct acting) room " + "heaters: Panel, convector or " + "radiant heaters" + ] + ) + ) + ) + + units_with_assigned_packages["Electric properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Electric") & ( + units_with_assigned_packages["Heating"].isin( + [ + "Boiler: A rated Regular Boiler", + "Boiler: F rated Combi", + "No Heating", + "Boiler: A rated CPSU", + "Boiler: G rated Regular Boiler" + ] + ) + ) + ) + + units_with_assigned_packages["Ground Source properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Ground Source") & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C", + "Electric Storage Systems: Fan storage heaters", + "Electric Storage Systems: High heat retention storage heaters" + ] + ) + ) + ) + + units_with_assigned_packages["LPG properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Lpg") & ( + units_with_assigned_packages["Main Fuel"].isin( + [ + "Gas: Mains Gas", "Solid Fuel: Wood Logs, Gas: Mains Gas" + ] + ) + ) + ) + + units_with_assigned_packages["Solid Fuel properties: different to Parity"] = ( + (units_with_assigned_packages["Heating Type"] == "Solid Fuel") & ( + units_with_assigned_packages["Main Fuel"].isin( + [ + "Gas: Mains Gas" + ] + ) + ) + ) + + # The next check is to identify properties with specific features that are not condusive to specific packages. E.g. 
+ # Solar PV packages for properties that have another dwelling above + + z = units_with_assigned_packages[ + units_with_assigned_packages["Package Ref"].isin( + [ + "3A", "3B", "4", 4 + ] + ) + ] + z["Roof Type"].value_counts() + z["Survey: Main Roof Type"].value_counts() + + z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ + "Survey: Matching Address ID"].value_counts() + + zz = z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ + ["Survey: Matching Address ID", "Survey: Org. ref.", "Survey: Main Roof Type"] + ].drop_duplicates() + zz = zz.sort_values("Survey: Matching Address ID") + zz.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "3A, 3B or 4 Packages with a dwelling above.csv"), index=False) + + z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")]["Package Ref"].value_counts() + + # Label properties that have been matched to a package, during coordination, that includes Solar PV and has + # a property with a dwelling above + units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( + (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( + units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") + ) + ) + + # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned + # a package that includes solar PV + # if __name__ == "__main__": # main() diff --git a/etl/customers/waltham_forest/whlg eligibile properties.py b/etl/customers/waltham_forest/whlg eligibile properties.py new file mode 100644 index 00000000..fee988c1 --- /dev/null +++ b/etl/customers/waltham_forest/whlg eligibile properties.py @@ -0,0 +1,77 @@ +""" +This is the list of properties, based on the EPC data, that look eligible for WHLG +""" +import pandas as pd +from etl.epc.settings import EARLIEST_EPC_DATE +from etl.spatial.OpenUprnClient import OpenUprnClient + +epc_data = pd.read_csv( + 
"/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates/domestic-E09000031-Waltham-Forest/certificates.csv" +) +epc_data.columns = [c.replace("_", "-").lower() for c in epc_data.columns] +epc_data = epc_data[epc_data["lodgement-date"] >= EARLIEST_EPC_DATE] + +epc_data = epc_data[~pd.isnull(epc_data["uprn"])] +epc_data["uprn"] = epc_data["uprn"].astype(int) + +epc_data = epc_data[epc_data["current-energy-rating"].isin(["D", "E", "F", "G"])] +epc_data = epc_data[epc_data["tenure"].isin( + ["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"]) +] + +whlg_eligible_postcodes = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", + sheet_name="Eligible postcodes", + header=1 +) +# Format: +whlg_eligible_postcodes = whlg_eligible_postcodes[['Postcode', 'Local Authority']] + +uprns = epc_data["uprn"].unique() +# Get data +ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") +epc_data = epc_data.merge( + ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename( + columns={"UPRN": "uprn"} + ), + how="left", + on="uprn", +) + +epc_data["has_conservation_restrictions"] = ( + (epc_data["conservation_status"] == True) + | (epc_data["is_listed_building"] == True) + | (epc_data["is_heritage_building"] == True) +) + +# Pathway 1: +# Match based on eligible postcodes +pathway1 = epc_data[epc_data["postcode"].isin(whlg_eligible_postcodes["Postcode"].values)] +pathway1 = pathway1[ + [ + "uprn", "address", "address1", "postcode", "current-energy-rating", "current-energy-efficiency", + "lodgement-date", + "has_conservation_restrictions", "walls-description", "roof-description", "mainheat-description" + ] +] + +pathway1 = pathway1.rename( + columns={ + "current-energy-rating": "EPC Rating", "current-energy-efficiency": "SAP Score", + "lodgement-date": "EPC Date", "has_conservation_restrictions": "Conservation Area Restrictions", + "walls-description": "Wall Type", 
"roof-description": "Roof Type", "mainheat-description": "Main Heating" + } +) + +pathway1["EPC Date"] = pd.to_datetime(pathway1["EPC Date"]).dt.strftime("%Y-%m-%d") +# Create a year EPC was lodged +pathway1["EPC Year"] = pd.to_datetime(pathway1["EPC Date"]).dt.year + +pathway1.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Waltham Forest WHLG - Pathway 1 Eligibility.csv", + index=False +) + +# Pathway 2 or 3 +# The household will need to be means tested +pathway2 = epc_data[~epc_data["uprn"].isin(pathway1["uprn"].values)] diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index 3dd486b3..5ea35a64 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -308,6 +308,7 @@ class RetrieveFindMyEpc: "Heating controls (programmer, and thermostatic radiator valves)": [ "roomstat_programmer_trvs", "time_temperature_zone_control" ], + "Replacement warm air unit": [] } survey = True diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 3c8cfa31..9ed55185 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -25,7 +25,7 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m epc_data = [] errors = [] no_epc = [] - # home = asset_list[asset_list["row_id"] == errors[15]].squeeze() + # home = asset_list[asset_list["row_id"] == errors[5]].squeeze() for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): try: postcode = home[postcode_column] @@ -154,21 +154,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford" - DATA_FILENAME = "BROMFORD - SOLAR PV ROOFs INSPECTED - Electric only properties getting to C list.xlsx" - SHEET_NAME = "MAIN" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Full Address" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_two_words" - ADDRESS_COLS_TO_CONCAT = ["House No", "Street", 
"District"] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Watford" + DATA_FILENAME = "JS Mailing List 10122024.xlsx" + SHEET_NAME = "Export" + POSTCODE_COLUMN = "Postcode" + FULLADDRESS_COLUMN = "Property Address" + ADDRESS1_COLUMN = "Address Line 1" + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = [] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "1 Ivy Court, The Gardens, Erdington, Birmingham": 100071442178, - "8 Ivy Court, The Gardens, Erdington, Birmingham": 10033393299, - "7 Ivy Court, The Gardens, Erdington, Birmingham": 100071442184, - } + MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() @@ -197,6 +193,7 @@ def app(): # Drop the dupes print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") asset_list = asset_list[~asset_list["deduper"].duplicated()] + asset_list = asset_list.drop(columns=["deduper"]) epc_data, errors, no_epc = get_data( asset_list=asset_list, @@ -212,7 +209,8 @@ def app(): asset_list=asset_list_failed, fulladdress_column=FULLADDRESS_COLUMN, address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN + postcode_column=POSTCODE_COLUMN, + manual_uprn_map=MANUAL_UPRN_MAP ) # Append the failed data to the main data @@ -261,6 +259,7 @@ def app(): "row_id", "uprn", "address1", + "address", "postcode", "property-type", "built-form", @@ -282,7 +281,7 @@ def app(): "energy-consumption-current", # kwh/m2 "photo-supply", ] - ].rename(columns={"address1": "Address1 on EPC", "postcode": "Postcode on EPC"}) + ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}) asset_list = asset_list.merge( epc_df, @@ -376,9 +375,9 @@ def app(): asset_list = asset_list.drop(columns=["row_id"]) # Store as an excel - filename = os.path.join(DATA_FOLDER, 
".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx" + filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" asset_list.to_excel(filename, index=False) matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address1 on EPC", "Postcode on EPC"] + [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] ] From 2b7ca82d09aea93737d5c93cb0619c55aba71063 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Tue, 10 Dec 2024 18:55:30 +0000 Subject: [PATCH 22/23] creating checking code for Stonewater --- .../stonewater/Wave 3 Preparation.py | 76 +++++++++++++------ 1 file changed, 54 insertions(+), 22 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index bd36d782..4e336f23 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2699,28 +2699,6 @@ def identify_incorrect_pacakges(): # The next check is to identify properties with specific features that are not condusive to specific packages. E.g. # Solar PV packages for properties that have another dwelling above - - z = units_with_assigned_packages[ - units_with_assigned_packages["Package Ref"].isin( - [ - "3A", "3B", "4", 4 - ] - ) - ] - z["Roof Type"].value_counts() - z["Survey: Main Roof Type"].value_counts() - - z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ - "Survey: Matching Address ID"].value_counts() - - zz = z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")][ - ["Survey: Matching Address ID", "Survey: Org. 
ref.", "Survey: Main Roof Type"] - ].drop_duplicates() - zz = zz.sort_values("Survey: Matching Address ID") - zz.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "3A, 3B or 4 Packages with a dwelling above.csv"), index=False) - - z[z["Survey: Main Roof Type"].str.contains("A Another dwelling above")]["Package Ref"].value_counts() - # Label properties that have been matched to a package, during coordination, that includes Solar PV and has # a property with a dwelling above units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( @@ -2731,6 +2709,60 @@ def identify_incorrect_pacakges(): # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned # a package that includes solar PV + units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = ( + (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & ( + units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above") + ) + ) + + # We now iterate through postcodes and find anomalous properties based on the partiy data and survey data + fields_to_check = [ + 'Wall Type', 'Roof Type', 'Heating', 'Main Fuel', + 'Survey: Main Wall Type', + 'Survey: Main Roof Type', 'Survey: Primary Heating System' + ] + # Create an empty dictionary to store results + aggregated_results = {} + + units_with_assigned_packages['Wall Type'] = units_with_assigned_packages['Wall Type'].str.replace( + r'\s*\(.*?\)', '', regex=True + ) + + def check_mixed_types(row): + # Count distinct primary types with non-zero values + primary_types_present = set() + for col in field_counts.columns: + if ':' in col: + primary_type = col.split(':')[0] + if row[col] > 0: # Non-zero count means this type is present + primary_types_present.add(primary_type) + return len(primary_types_present) > 1 # True if more than one primary type + + # Process each field + for field in fields_to_check: + # Group by 
postcode and count occurrences of each unique value + field_counts = ( + units_with_assigned_packages.groupby(['Postcode', field]) + .size() + .unstack(fill_value=0) + .reset_index() + ) + + # Calculate dominant value and percentage before modifying the DataFrame + dominant_value = field_counts.iloc[:, 1:].idxmax(axis=1) + dominant_percentage = ( + (field_counts.iloc[:, 1:].max(axis=1) / field_counts.iloc[:, 1:].sum(axis=1)) * 100 + ) + number_of_properties = field_counts.iloc[:, 1:].sum(axis=1) + + # Add these as new columns after computation + field_counts['Dominant Value'] = dominant_value + field_counts['% Dominant'] = dominant_percentage + field_counts['Number of Properties'] = number_of_properties + field_counts['Mixed Type'] = field_counts.apply(check_mixed_types, axis=1) + + # Store the result in the dictionary + aggregated_results[field] = field_counts # if __name__ == "__main__": # main() From 000fe4dabb576e5ff22fe36a3bc0cdab46139f46 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Fri, 13 Dec 2024 09:34:15 +0000 Subject: [PATCH 23/23] minor stonewater work --- .../stonewater/Wave 3 Preparation.py | 182 ++++++++++++++++-- 1 file changed, 161 insertions(+), 21 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 4e336f23..d2232f40 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -6,6 +6,8 @@ import numpy as np from tqdm import tqdm from collections import Counter from scipy.optimize import linprog + +from SearchEpc import SearchEpc from utils.s3 import read_pickle_from_s3 CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" @@ -2608,7 +2610,7 @@ def propsed_wave_3_sample(): len(list(set(units_in_bid))) -def identify_incorrect_pacakges(): +def identify_incorrect_packages(): """ Due to limitations in the data collected during survey, we have some properties that do not 
have suitable packages assigned. This function will identify those properties, which can be flagged for Stonewater's review @@ -2635,21 +2637,23 @@ def identify_incorrect_pacakges(): # Check the different heating types units_with_assigned_packages["Gas properties: different to Parity"] = ( - (units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"])) & ( - units_with_assigned_packages["Heating"].isin( - [ - "Heat Pump: Electric Heat " - "pumps: Air source heat pump " - "with flow temperature <= 35°C", - "Electric Storage Systems: Fan " - "storage heaters", - "Electric (direct acting) room " - "heaters: Panel, convector or " - "radiant heaters" - ] + ( + units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"]) + ) & ( + units_with_assigned_packages["Heating"].isin( + [ + "Heat Pump: Electric Heat " + "pumps: Air source heat pump " + "with flow temperature <= 35°C", + "Electric Storage Systems: Fan " + "storage heaters", + "Electric (direct acting) room " + "heaters: Panel, convector or " + "radiant heaters" + ] + ) ) ) - ) units_with_assigned_packages["Electric properties: different to Parity"] = ( (units_with_assigned_packages["Heating Type"] == "Electric") & ( @@ -2717,17 +2721,26 @@ def identify_incorrect_pacakges(): # We now iterate through postcodes and find anomalous properties based on the partiy data and survey data fields_to_check = [ - 'Wall Type', 'Roof Type', 'Heating', 'Main Fuel', + 'Wall Type Category', + # 'Roof Type Category', - not very interesting + 'Heating', + 'Main Fuel', 'Survey: Main Wall Type', - 'Survey: Main Roof Type', 'Survey: Primary Heating System' + # 'Survey: Main Roof Type', + 'Survey: Primary Heating System' ] - # Create an empty dictionary to store results - aggregated_results = {} - units_with_assigned_packages['Wall Type'] = units_with_assigned_packages['Wall Type'].str.replace( + units_with_assigned_packages['Wall Type Category'] = units_with_assigned_packages['Wall Type'].str.replace( 
r'\s*\(.*?\)', '', regex=True ) + # Create roof type category by splitting in colon and taking the first part + units_with_assigned_packages['Roof Type Category'] = units_with_assigned_packages['Roof Type'].str.split(':').str[0] + + units_with_assigned_packages["Street, Region and Postcode"] = ( + units_with_assigned_packages["Street and Region"] + ", " + units_with_assigned_packages["Postcode"] + ) + def check_mixed_types(row): # Count distinct primary types with non-zero values primary_types_present = set() @@ -2738,11 +2751,11 @@ def identify_incorrect_pacakges(): primary_types_present.add(primary_type) return len(primary_types_present) > 1 # True if more than one primary type - # Process each field + aggregated_results = {} for field in fields_to_check: # Group by postcode and count occurrences of each unique value field_counts = ( - units_with_assigned_packages.groupby(['Postcode', field]) + units_with_assigned_packages.groupby(['Street, Region and Postcode', field]) .size() .unstack(fill_value=0) .reset_index() @@ -2764,5 +2777,132 @@ def identify_incorrect_pacakges(): # Store the result in the dictionary aggregated_results[field] = field_counts + # Let's fetch the EPC data + # Read in the existing EPC data we stored + import json + from utils.s3 import read_from_s3, read_pickle_from_s3 + def read_epc_data(): + epc_data = json.loads( + read_from_s3( + bucket_name="retrofit-data-dev", + s3_file_name="customers/Stonewater/clustering/epc_data.json" + ) + ) + epc_data = pd.DataFrame(epc_data) + + epc_data["uprn"] = np.where( + epc_data["internal_id"] == 1091, + 83143766, + epc_data["uprn"] + ) + epc_data_batch_2 = read_pickle_from_s3( + s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", + bucket_name="retrofit-data-dev" + ) + epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) + + complete_epcs = pd.concat([epc_data, epc_data_batch_2]) + + return complete_epcs + + epc_data = read_epc_data() + # Get just the fields we want from the EPC: Uprn, Wall, 
Roof, Heating, Fuel, SAP Score, EPC Band, Date of EPC + epc_data_to_append = epc_data[ + [ + "uprn", "walls-description", "roof-description", "mainheat-description", "main-fuel", + "current-energy-efficiency", "current-energy-rating", "lodgement-date", + "estimated" + ] + ].rename( + columns={ + "uprn": "UPRN", + "walls-description": "EPC: Wall Type", + "roof-description": "EPC: Roof Type", + "mainheat-description": "EPC: Heating", + "mainfuel": "EPC: Main Fuel", + "current-energy-efficiency": "EPC: SAP Score", + "current-energy-rating": "EPC: EPC Band", + "lodgement-date": "EPC: Date of EPC", + "estimated": "EPC Estimated based on Nearby Properties" + } + ) + # Find entries where the SAP score is not an integer + non_integer_sap = epc_data_to_append[~epc_data_to_append["EPC: SAP Score"].astype(str).str.isnumeric()] + non_integer_sap["UPRN"].values[0] + + epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"]) + # Years since the EPC was lodged + epc_data_to_append["Years since EPC"] = (pd.Timestamp.now() - epc_data_to_append["EPC: Date of EPC"]).dt.days / 365 + epc_data_to_append = epc_data_to_append[epc_data_to_append["UPRN"] != ""] + epc_data_to_append["UPRN"] = epc_data_to_append["UPRN"].astype(int) + + units_with_assigned_packages = units_with_assigned_packages.merge( + epc_data_to_append, how="left", on="UPRN", + ) + + # Read in the wave 2.1 data + wave_2_data = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Stonewater 2.1 SAP Pre & Post.xlsx" + ), + header=3 + ) + # Remove any where the work is outstanding + wave_2_data = wave_2_data[wave_2_data["Retrofit Assessment"] == "Completed"] + wave_2_data = wave_2_data[~pd.isnull(wave_2_data["Package Approved (Client)"])] + wave_2_data["house_number"] = wave_2_data["Name"].apply(lambda x: SearchEpc.get_house_number(x, "")) + + # Filter postcodes in the units_with_assigned_packages, to find overlapping postcodes + related_to_wave_2 = units_with_assigned_packages[ + 
units_with_assigned_packages["Postcode"].isin( + wave_2_data["Post Code"].values + ) & ( + ~units_with_assigned_packages["Confidence Tier"].isin( + [ + "1 - same archetype, same postal region", "1 - property was surveyed" + ] + ) + ) + ] + + wave2_matches = [] + for _, home in related_to_wave_2.iterrows(): + # Get the related homes + assigned_wave_2_packages = wave_2_data[ + wave_2_data["Post Code"] == home["Postcode"] + ] + + if assigned_wave_2_packages.shape[0] != 1: + # In this case, we get the closest match based on door number + hn = SearchEpc.get_house_number(home["Name"], home["Postcode"]) + + assigned_wave_2_packages = assigned_wave_2_packages[ + abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn)) == min( + abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn))) + ] + + wave2_matches.append( + { + "UPRN": home["UPRN"], + "2.1 matched address": assigned_wave_2_packages["Name"].values[0], + "2.1 matched address: Package Ref": assigned_wave_2_packages["Package Approved (Client)"].values[0], + "2.1 matched address: Wall Insulation": assigned_wave_2_packages["Wall Insulation"].values[0], + "2.1 matched address: Loft Insulation": assigned_wave_2_packages["Loft Insulation"].values[0], + "2.1 matched address: Ventilation": assigned_wave_2_packages["Ventilation"].values[0], + "2.1 matched address: Windows": assigned_wave_2_packages["Windwos Upgrade"].values[0] + } + ) + + # Store each results to CSV + for field, df in aggregated_results.items(): + df.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, f"{field} - aggregated results.csv"), index=False + ) + + # Store units_with_assigned_packages + units_with_assigned_packages.to_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False + ) + # if __name__ == "__main__": # main()