Debugging extract_epr for the old Elmhurst EPR format

This commit is contained in:
Khalim Conn-Kowlessar 2025-01-28 22:15:53 +00:00
parent 86deed8115
commit ca7a0e9d10
2 changed files with 34 additions and 13 deletions

View file

@ -747,12 +747,30 @@ def extract_epr(pdf_path):
# Extract Current and Potential SAP ratings
sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
data["Current SAP Rating"] = current_sap
if sap_match is None:
# Handles the older format of the Elmhurst EPR
# The text will look something like this:
# "Least energy efficient - higher running costsD 61" - from which we extract "D 61"
sap_match = re.search(
r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
text)
data["Current EPC Band"] = sap_match.group("current_epc")
data["Current SAP Rating"] = int(sap_match.group("current_sap"))
else:
current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
data["Current SAP Rating"] = current_sap
# Extract the primary energy use intensity
additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
if additional_rating_match:
data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
else:
# Handles the older format of the Elmhurst EPR
primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
# We calculate the primary energy use intensity by dividing by floor area
floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)
# Extract Number of Storeys
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@ -2983,8 +3001,13 @@ def revised_model():
# We now do a large pull of all of the data
extracted_data = []
for survey_folder in tqdm(survey_folders):
survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
# Check that the survey folder is actually a folder
if not os.path.isdir(survey_folder_path):
continue
# List the folders inside of the survey folder
survey_subfolders = [
name for name in os.listdir(survey_folder_path)

View file

@ -162,19 +162,17 @@ def app():
Property UPRN
"""
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern"
DATA_FILENAME = "January 2025 Additions Query.xlsx"
SHEET_NAME = "Jan 2025 additions"
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/For Housing"
DATA_FILENAME = "For Housing Data pull.xlsx"
SHEET_NAME = "Sheet1"
POSTCODE_COLUMN = "Post Code"
FULLADDRESS_COLUMN = "Street / Block Name"
ADDRESS1_COLUMN = None
ADDRESS1_METHOD = "first_word"
ADDRESS_COLS_TO_CONCAT = []
FULLADDRESS_COLUMN = None
ADDRESS1_COLUMN = "NO."
ADDRESS1_METHOD = None
ADDRESS_COLS_TO_CONCAT = ["NO.", "Street / Block Name"]
# Maps addresses to UPRN in problematic cases
MANUAL_UPRN_MAP = {
"Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560
}
MANUAL_UPRN_MAP = {}
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()