From 711db3f552e958128faeb49a22073e5461dbc4f6 Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Thu, 13 Feb 2025 07:59:12 +0000 Subject: [PATCH] adding v1 extraction to stonewater --- .../stonewater/Wave 3 Preparation.py | 53 +++++++++++++++++-- 1 file changed, 48 insertions(+), 5 deletions(-) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index e471211c..12158671 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -182,7 +182,10 @@ def extract_summary_report(pdf_path): data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] # Extract age - age_band_match = re.search(r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4})", text) + age_band_match = re.search( + r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) data["Main Building Age Band"] = age_band_match.group(1) # Number of storeys @@ -786,7 +789,11 @@ def extract_epr(pdf_path): data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract age band - age_band_match = re.search(r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4})", text) + age_band_match = re.search( + r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) + data["Main Building Age Band"] = age_band_match.group(1) # Extract Number of Storeys @@ -3065,8 +3072,21 @@ def revised_model(): os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) ] + + has_v1 = [ + f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() + ] + + if has_v1: + # Then we go one level deeper + mtp_contents = [ + os.path.join(has_v1[0], f) for f in + os.listdir(os.path.join(survey_folder_path, has_v1[0])) + ] + # We check the the IMA for 
file_name in mtp_contents: + filepath = os.path.join(survey_folder_path, file_name) # We expect a pdf so try and parse it try: @@ -3092,6 +3112,12 @@ def revised_model(): has_pv = bool(pv_search) pv_system = pv_search.group(0) if has_pv else None + # We perform a second search for PV: + if pv_search is None: + pv_search = re.search("solar pv", text.lower()) + has_pv = bool(pv_search) + pv_system = "Solar PV" if has_pv else None + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) has_rir = bool(rir_search) rir_spec = rir_search.group(0) if has_rir else None @@ -3149,12 +3175,20 @@ def revised_model(): extracted_data.append(summary_data) retrofit_assessment_data = pd.DataFrame(extracted_data) + mtp_df = pd.DataFrame(mtp_extracted_data) + # Save # retrofit_assessment_data.to_csv( # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), index=False # ) + # mtp_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), index=False + # ) retrofit_assessment_data = pd.read_csv( - os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 2.csv"), + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 3.csv"), + ) + mtp_df = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 3.csv"), ) # Remove some definite duplicates @@ -3164,6 +3198,9 @@ def revised_model(): # Get all of the folders that end with ROSS to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + retrofit_assessment_data = retrofit_assessment_data[ ~retrofit_assessment_data["survey_folder"].isin( [ @@ -3173,8 +3210,6 @@ def revised_model(): ] + to_drop ) ] - # Replace \n with "" - retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") 
retrofit_assessments_data_columns = [ 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', @@ -3685,9 +3720,17 @@ def revised_model(): if not missed_asset_id.empty: raise Exception("Missing Asset ID") + # We merge the MTP data on to the wates coordination + wates_coordination = wates_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( ccs_matching_lookup, how="left", on="Name" ) + ccs_coordination = ccs_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) retrofit_packages_board = retrofit_packages_board.merge( matching_lookup, how="left", on="Name"