diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 24a8e9bb..e471211c 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -125,13 +125,13 @@ def extract_summary_report(pdf_path): - Address """ - blah data = { "Address": None, "Postcode": None, "Current SAP Rating": None, "Current EPC Band": None, "Fuel Bill": None, + "Main Building Age Band": None, "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -181,6 +181,10 @@ def extract_summary_report(pdf_path): sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + # Extract age + age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + data["Main Building Age Band"] = age_band_match.group(1) + # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) @@ -3027,6 +3031,7 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] + mtp_extracted_data = [] # Additional data to extract from the medium term plans for survey_folder in tqdm(survey_folders): survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) @@ -3048,6 +3053,58 @@ def revised_model(): None ) + mtp_folder = next( + (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), + None + ) + if mtp_folder: + # We have a mid term plan: + mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) + # Get the contents - files and not folder + mtp_contents = [ + os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) + if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) + ] + # We check the the IMA + for file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) + # We expect a pdf so try and parse it + try: + with open(filepath, "rb") as file: + reader = PyPDF2.PdfReader(file) + # Just the first page + text = reader.pages[0].extract_text() + + except Exception as e: + continue + + # We check if this is an IMA + ima_heading_search = re.search( + r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text + ) + + is_ima = bool(ima_heading_search) + if not is_ima: + continue + + # Otherwise, extract: RIR, PV + pv_search = re.search(r"PV \(\d+Kwp\)", text) + has_pv = bool(pv_search) + pv_system = pv_search.group(0) if has_pv else None + + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) + has_rir = bool(rir_search) + rir_spec = rir_search.group(0) if has_rir else None + + mtp_extracted_data.append({ + "survey_folder": survey_folder, + "has_pv": has_pv, + "PV System": pv_system, + "RIR Specification": rir_spec, + "has_rir": has_rir + }) + continue + # If retrofit assessment folder exists, check if it has content if retrofit_folder or ra_folder: if retrofit_folder: @@ -3094,7 +3151,7 @@ def revised_model(): retrofit_assessment_data = pd.DataFrame(extracted_data) # retrofit_assessment_data.to_csv( - # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), index=False + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) retrofit_assessment_data = pd.read_csv( os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"),