diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 105628e9..ee314f17 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -747,12 +747,30 @@ def extract_epr(pdf_path): # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) - current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) - data["Current SAP Rating"] = current_sap + if sap_match is None: + # Handles the older format of the elmhurst EPR + # The text will look something like this: + # Least energy efficient - higher running costsD 61 - we extract D 61 + sap_match = re.search( + r"(?P[A-G])\s(?P\d{1,3})(?P[A-G])\s(?P\d{1,3})", + text) + data["Current EPC Band"] = sap_match.group("current_epc") + data["Current SAP Rating"] = int(sap_match.group("current_sap")) + else: + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap # Extract the primary energy use intensity additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) - data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + if additional_rating_match: + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + else: + # Handles the older format of the Elmhurst EPR + primary_energy_match = re.search(r"actual consumption\.\n(?P\d+)", text) + data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy")) + # We calculate the primary energy use intensity by dividing by floor area + floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") + data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -2983,8 +3001,13 @@ def revised_model(): # We now do a large pull of all of the data extracted_data = [] for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + # Check that the survey folder is actually a folder + if not os.path.isdir(survey_folder_path): + continue + # List the folders inside of the survey folder survey_subfolders = [ name for name in os.listdir(survey_folder_path) diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 8d19aa84..247ce98c 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -162,19 +162,17 @@ def app(): Property UPRN """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern" - DATA_FILENAME = "January 2025 Additions Query.xlsx" - SHEET_NAME = "Jan 2025 additions" + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing" + DATA_FILENAME = "For Housing Data pull.xlsx" + SHEET_NAME = "Sheet1" POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Street / Block Name" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] + FULLADDRESS_COLUMN = None + ADDRESS1_COLUMN = "NO." + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"] # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560 - } + MANUAL_UPRN_MAP = {} asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()