mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
779 lines
30 KiB
Python
779 lines
30 KiB
Python
import os
|
||
import time
|
||
from BaseUtility import Definitions
|
||
import pandas as pd
|
||
import numpy as np
|
||
from tqdm import tqdm
|
||
from datetime import datetime
|
||
|
||
from dotenv import load_dotenv
|
||
from backend.SearchEpc import SearchEpc
|
||
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
|
||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||
|
||
from recommendations.recommendation_utils import (
|
||
estimate_perimeter,
|
||
estimate_external_wall_area,
|
||
estimate_number_of_floors
|
||
)
|
||
|
||
from etl.epc_clean.epc_attributes.attribute_utils import (
|
||
extract_thermal_transmittance
|
||
)
|
||
|
||
load_dotenv(dotenv_path="backend/.env")
|
||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||
|
||
|
||
def _retrieve_find_my_epc_data(newest_epc):
    """Return the newest FindMyEPC record for ``newest_epc``, or ``{}`` if none.

    The lookup is first made with the EPC's ``address`` field. If that raises
    a ``ValueError`` mentioning "No EPC found" and the EPC has an ``address1``
    field, the lookup is retried once with ``address1``. Any ``ValueError``
    that is not retried yields ``{}``.

    Raises:
        Exception: if the first lookup fails with anything other than a
            ``ValueError`` (the original error is chained as the cause).
    """
    try:
        return RetrieveFindMyEpc(
            address=newest_epc["address"], postcode=newest_epc["postcode"]
        ).retrieve_newest_find_my_epc_data()
    except ValueError as e:
        if "No EPC found" not in str(e) or "address1" not in newest_epc:
            return {}
    except Exception as e:
        raise Exception(f"Error retrieving FindMyEPC data: {e}") from e

    try:
        return RetrieveFindMyEpc(
            address=newest_epc["address1"], postcode=newest_epc["postcode"]
        ).retrieve_newest_find_my_epc_data()
    except ValueError:
        # Always return a value here: previously a ValueError without
        # "No EPC found" left the result unassigned, so the loop reused the
        # previous home's FindMyEPC data (or failed on the first home).
        return {}


def get_data(
    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None,
    epc_api_only=False
):
    """Look up the newest EPC for every row of ``asset_list``.

    Args:
        asset_list: DataFrame of properties; must contain a ``row_id`` column
            plus the columns named by the other arguments.
        fulladdress_column: name of the column holding the full address.
        address1_column: name of the column holding the first address line /
            house number.
        postcode_column: name of the column holding the postcode.
        manual_uprn_map: mapping of full address -> UPRN, consulted before
            ``uprn_column``.
        uprn_column: optional name of a column holding UPRNs.
        epc_api_only: when True, only the EPC record is collected (no
            recommendations and no FindMyEPC data).

    Returns:
        A tuple ``(epc_data, errors, no_epc)``: a list of dicts (one per home
        with an EPC), the ``row_id`` values that raised an exception, and the
        ``row_id`` values for which no EPC was found.
    """
    epc_data = []
    errors = []
    no_epc = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            postcode = home[postcode_column]
            house_number = str(home[address1_column]).strip()
            full_address = home[fulladdress_column].strip()

            # Computed once and reused by the retry branch below.
            extracted_house_no = SearchEpc.get_house_number(address=house_number, postcode=postcode)
            house_no = house_number if extracted_house_no is None else extracted_house_no

            uprn = manual_uprn_map.get(full_address)
            if uprn is None and uprn_column is not None and home.get(uprn_column):
                uprn = home[uprn_column]
            if pd.isnull(uprn):
                uprn = None

            # Arguments shared by the first search and the retry.
            search_kwargs = dict(
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5,
            )

            searcher = SearchEpc(address1=str(house_no), uprn=uprn, **search_kwargs)
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)

            # Nothing found and no UPRN to rely on: retry with a different
            # first address line (this helps with flats and apartments).
            if searcher.newest_epc is None and uprn is None:
                if extracted_house_no is None:
                    # Backup: second comma-separated part of the full
                    # address, otherwise its first space-separated word.
                    address_parts = full_address.split(",")
                    if len(address_parts) > 1:
                        add1 = address_parts[1].strip()
                    else:
                        add1 = full_address.split(" ")[0].strip()
                else:
                    add1 = house_number

                searcher = SearchEpc(address1=add1, **search_kwargs)

                lowered_house_number = house_number.lower()
                if any(marker in lowered_house_number for marker in ("flat", "apartment", "apt")):
                    searcher.ordnance_survey_client.property_type = "Flat"

                searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                no_epc.append(home["row_id"])
                continue

            if epc_api_only:
                epc_data.append({"row_id": home["row_id"], **searcher.newest_epc.copy()})
                continue

            # Look for EPC recommendations. Best effort: a failure means
            # "no recommendations". `except Exception` (not a bare except)
            # so that Ctrl-C still interrupts the run.
            try:
                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
            except Exception:
                property_recommendations = {"rows": []}

            # Retrieve data from FindMyEPC
            find_epc_data = _retrieve_find_my_epc_data(searcher.newest_epc)
            # Randomised pause between homes.
            time.sleep(np.random.uniform(0.1, 1))

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc.copy(),
                    "recommendations": property_recommendations["rows"],
                    "find_my_epc_data": find_epc_data,
                }
            )
        except Exception as e:
            errors.append(home["row_id"])
            # Report the failure instead of dropping it silently; tqdm.write
            # prints without corrupting the progress bar.
            tqdm.write(f"Failed to retrieve EPC data for row {home['row_id']}: {e!r}")
            time.sleep(5)

    return epc_data, errors, no_epc
|
||
|
||
|
||
def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
    """Add an ``address1_extracted`` column derived from the full address.

    Args:
        asset_list: DataFrame to update; the column is added in place and the
            same frame is returned.
        full_address_col: name of the column holding the full address.
        postcode_col: name of the column holding the postcode (only used by
            ``"house_number_extraction"``).
        method: one of
            ``"first_two_words"`` - first two whitespace-separated words,
            ``"first_word"`` - first whitespace-separated word,
            ``"house_number_extraction"`` - ``SearchEpc.get_house_number``.

    Returns:
        ``asset_list`` with the ``address1_extracted`` column set.

    Raises:
        ValueError: if ``method`` is not one of the options above.
    """
    if method in ("first_two_words", "first_word"):
        word_count = 2 if method == "first_two_words" else 1
        # Split on any run of whitespace rather than on a single space, so
        # leading or repeated spaces do not produce empty "words".
        words = asset_list[full_address_col].str.split()
        asset_list["address1_extracted"] = words.str[:word_count].str.join(" ")
    elif method == "house_number_extraction":
        asset_list["address1_extracted"] = asset_list.apply(
            lambda row: SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col]),
            axis=1
        )
    else:
        raise ValueError(f"Method {method} not recognized")

    return asset_list
|
||
|
||
|
||
def process_age_band(x, year_built_column):
    """Compare a property's year built against its EPC construction age band.

    Args:
        x: row (mapping / Series) containing ``year_built_column`` and
            ``"Property Age Band"``.
        year_built_column: name of the field holding the year built, either a
            ``datetime`` or something ``float()`` can parse.

    Returns:
        One of ``"No EPC Age Band"``, ``"EPC Age Band Matches Year Built"``,
        ``"EPC Age Band is older than Year Built"`` (the band lies before the
        year built) or ``"EPC Age Band is newer than Year Built"`` (the band
        lies after the year built).

    Raises:
        ValueError: if the year built is not numeric, or the age band is not
            in a recognised format.
    """
    matches = "EPC Age Band Matches Year Built"
    band_is_older = "EPC Age Band is older than Year Built"
    band_is_newer = "EPC Age Band is newer than Year Built"

    raw_year = x[year_built_column]
    year_built = raw_year.year if isinstance(raw_year, datetime) else float(raw_year)

    age_band = x["Property Age Band"]
    if (
        pd.isnull(age_band)
        or pd.isnull(year_built)
        or age_band in Definitions.DATA_ANOMALY_MATCHES
    ):
        return "No EPC Age Band"

    # Drop any "<region>: " prefix, e.g. "England and Wales: 1967-1975".
    # str() also copes with bands that were read back as plain numbers.
    period = str(age_band).split(": ", 1)[-1].strip()

    if period.startswith("before "):
        # Open-ended lower band, e.g. "before 1900" (1900 itself is excluded).
        threshold = float(period[len("before "):])
        return matches if year_built < threshold else band_is_older

    if period.isdigit():
        # A single year.
        lower = upper = float(period)
    elif period.endswith(" onwards"):
        # Open-ended upper band, e.g. "2007 onwards" / "2012 onwards".
        lower, upper = float(period[:-len(" onwards")]), float("inf")
    else:
        # Closed band formatted as "{lower date}-{upper date}".
        lower_text, _, upper_text = period.partition("-")
        try:
            lower, upper = float(lower_text), float(upper_text)
        except ValueError as err:
            raise ValueError(f"Unrecognised EPC age band: {age_band!r}") from err

    # Same direction for every band shape: built before the band starts means
    # the band is newer; built after it ends means the band is older.
    if year_built < lower:
        return band_is_newer
    if year_built > upper:
        return band_is_older
    return matches
|
||
|
||
|
||
def app():
|
||
"""
|
||
This app pulls EPC data for some properties owned by Livewest
|
||
|
||
Data request contents:
|
||
Date of last EPC
|
||
Reason for EPC
|
||
SAP score on register
|
||
Property Type
|
||
Property Area
|
||
Property Age
|
||
Any Dimensions (HLP,PW,RH)
|
||
Property Wall Construction
|
||
Heating Type
|
||
Secondary Heating
|
||
Loft Insulation Depth
|
||
|
||
Additional if possible:
|
||
Heat loss calculations
|
||
EPC recommendations
|
||
Property UPRN
|
||
"""
|
||
|
||
# TODO:
|
||
# For cavity work:
|
||
# - Flag any entries that have a different wall type between non-intrusive data against EPC
|
||
# - Worth double checking entries that have a difference in wall construction
|
||
# - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
|
||
# - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
|
||
# - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
|
||
# are less than C75
|
||
# - Flag anything pre SAP2012
|
||
# - Flag anything over 5 years old
|
||
# - Look at year built vs age band
|
||
#
|
||
# For Solar:
|
||
# - Discount any that have solar PV - based on non-intrusives and from the inspections team
|
||
# - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
|
||
# electric room heaters but it might need to be an EPC E
|
||
# - Fabric - check the floor, wall and roof:
|
||
# - Filled or empty cavity is good
|
||
# - Insulated solid/timber/system built is good
|
||
# - SCIS/CEG needs solid floors
|
||
# - JJC don’t care
|
||
# - Anything with a loft 200 or below
|
||
# - Anything C75 and above won’t qualify
|
||
# - Insulated loft = 200mm
|
||
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
|
||
# - Or the insulation required is loft/cavity (floors should be solid)
|
||
|
||
# For Westward
|
||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
|
||
DATA_FILENAME = "WESTWARD - completed list..xlsx"
|
||
SHEET_NAME = "Sheet1"
|
||
POSTCODE_COLUMN = "WFT EDIT Postcode"
|
||
FULLADDRESS_COLUMN = "Address"
|
||
ADDRESS1_COLUMN = None
|
||
ADDRESS1_METHOD = "house_number_extraction"
|
||
ADDRESS_COLS_TO_CONCAT = []
|
||
MISSING_POSTCODES_METHOD = None
|
||
PROPERTY_YEAR_BUILT = "Build date"
|
||
UPRN_COLUMN = "UPRN"
|
||
# If we have the non-intrusives data, this should be true
|
||
HAS_NON_INTRUSIVES = True
|
||
|
||
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
|
||
# DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
|
||
# SHEET_NAME = "Sheet1"
|
||
# POSTCODE_COLUMN = 'Full Address.1'
|
||
# FULLADDRESS_COLUMN = "Full Address"
|
||
# ADDRESS1_COLUMN = None
|
||
# ADDRESS1_METHOD = "first_word"
|
||
# ADDRESS_COLS_TO_CONCAT = []
|
||
# MISSING_POSTCODES_METHOD = None
|
||
# PROPERTY_YEAR_BUILT = "Build Date"
|
||
# UPRN_COLUMN = None
|
||
# # If we have the non-intrusives data, this should be true
|
||
# HAS_NON_INTRUSIVES = True
|
||
|
||
# Maps addresses to uprn in problematic cases
|
||
MANUAL_UPRN_MAP = {}
|
||
|
||
asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
|
||
|
||
if MISSING_POSTCODES_METHOD is not None:
|
||
if MISSING_POSTCODES_METHOD == "last_two_words":
|
||
# Replace any double spaces
|
||
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False)
|
||
asset_list["Postcode"] = np.where(
|
||
pd.isnull(asset_list["Postcode"]),
|
||
asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "),
|
||
asset_list["Postcode"]
|
||
)
|
||
else:
|
||
raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized")
|
||
|
||
asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
|
||
asset_list["row_id"] = asset_list.index
|
||
|
||
# We clean up potential non-breaking spaces, and double spaces
|
||
for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
|
||
asset_list[col] = asset_list[col].astype(str)
|
||
asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
|
||
asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False)
|
||
asset_list[col] = asset_list[col].str.strip()
|
||
|
||
if ADDRESS1_COLUMN is None:
|
||
ADDRESS1_COLUMN = "address1_extracted"
|
||
asset_list = extract_address1(
|
||
asset_list=asset_list,
|
||
full_address_col=FULLADDRESS_COLUMN,
|
||
postcode_col=POSTCODE_COLUMN,
|
||
method=ADDRESS1_METHOD
|
||
)
|
||
|
||
if FULLADDRESS_COLUMN is None:
|
||
FULLADDRESS_COLUMN = "fulladdress_extracted"
|
||
# We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
|
||
# Sometimes, some of the columns are empty, so we need to remove them
|
||
asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(
|
||
lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1
|
||
)
|
||
|
||
# We clean up potential non-breaking spaces, and double spaces
|
||
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str)
|
||
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
|
||
asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace(' ', ' ', regex=False)
|
||
|
||
if UPRN_COLUMN is not None:
|
||
# Check if it's numeric and if so, make sure it's an integer
|
||
def convert_uprn(x):
    """Normalise a UPRN cell to a plain digit string where possible.

    Missing values are returned unchanged. Numbers (including NumPy scalars,
    e.g. a UPRN read from Excel as a float) become their integer digits.
    Digit-only strings, optionally padded with whitespace, are normalised the
    same way. Anything else is returned untouched.
    """
    if pd.isnull(x):
        return x

    # Explicit type check rather than np.isreal, which NumPy documents as
    # behaving unexpectedly for strings.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return str(int(x))

    text = str(x).strip()
    if text.isdigit():
        return str(int(text))

    return x
|
||
|
||
asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn)
|
||
|
||
# We attempt to process the year built column
|
||
if PROPERTY_YEAR_BUILT is not None:
|
||
# We check if we have a datetime
|
||
if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime):
|
||
# We treat any string columns - with common values we see
|
||
datetime_remap = {
|
||
"Pre 1900": datetime(year=1899, month=12, day=31),
|
||
}
|
||
asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap)
|
||
|
||
asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT])
|
||
# Convert this to year
|
||
asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year
|
||
|
||
# We check for duplicated addresses
|
||
asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
|
||
if asset_list["deduper"].duplicated().sum():
|
||
# Drop the dupes
|
||
print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
|
||
asset_list = asset_list[~asset_list["deduper"].duplicated()]
|
||
asset_list = asset_list.drop(columns=["deduper"])
|
||
|
||
# We chunk up this data into 5000 rows at a time
|
||
# Create the chunks directory
|
||
if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")):
|
||
os.makedirs(os.path.join(DATA_FOLDER, "Chunks"))
|
||
chunk_size = 5000
|
||
errors = []
|
||
no_epc = []
|
||
skip = None # Used to skip already completed chunks
|
||
for i in range(0, len(asset_list), chunk_size):
|
||
print(f"Processing chunk {i} to {i + chunk_size}")
|
||
if skip is not None:
|
||
if i <= skip:
|
||
continue
|
||
chunk = asset_list[i:i + chunk_size]
|
||
epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
|
||
asset_list=chunk,
|
||
fulladdress_column=FULLADDRESS_COLUMN,
|
||
address1_column=ADDRESS1_COLUMN,
|
||
postcode_column=POSTCODE_COLUMN,
|
||
manual_uprn_map=MANUAL_UPRN_MAP,
|
||
uprn_column=UPRN_COLUMN
|
||
)
|
||
|
||
# We now retrieve any failed properties
|
||
chunk_failed = chunk[chunk["row_id"].isin(errors)]
|
||
epc_data_failed, _, _ = get_data(
|
||
asset_list=chunk_failed,
|
||
fulladdress_column=FULLADDRESS_COLUMN,
|
||
address1_column=ADDRESS1_COLUMN,
|
||
postcode_column=POSTCODE_COLUMN,
|
||
manual_uprn_map=MANUAL_UPRN_MAP,
|
||
epc_api_only=False
|
||
)
|
||
|
||
epc_data_chunk.extend(epc_data_failed)
|
||
errors.extend(errors_chunk)
|
||
no_epc.extend(no_epc_chunk)
|
||
|
||
# Append the failed data to the main data
|
||
# Store the chunk locally as a csv
|
||
pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)
|
||
|
||
# We read in and concatenate the created chunks
|
||
chunks_folder = os.path.join(DATA_FOLDER, "Chunks")
|
||
# List the contents
|
||
chunk_files = os.listdir(chunks_folder)
|
||
epc_data = []
|
||
for file in chunk_files:
|
||
csv_data = pd.read_csv(os.path.join(chunks_folder, file))
|
||
# We need to convert the recommendations back to a list
|
||
csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
|
||
csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
|
||
epc_data.append(csv_data)
|
||
|
||
epc_df = pd.concat(epc_data)
|
||
|
||
# We expand out the recommendations
|
||
recommendations_df = epc_df[["row_id", "recommendations"]]
|
||
|
||
unique_recommendations = set()
|
||
for _, row in recommendations_df.iterrows():
|
||
unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
|
||
|
||
columns = ["row_id"] + list(unique_recommendations)
|
||
transformed_data = []
|
||
for _, row in recommendations_df.iterrows():
|
||
# Initialize a dictionary for this row with False for all recommendations
|
||
row_data = {col: False for col in columns}
|
||
row_data["row_id"] = row["row_id"]
|
||
|
||
# Set True for each recommendation present in this row
|
||
for rec in row["recommendations"]:
|
||
recommendation_text = rec["improvement-summary-text"]
|
||
row_data[recommendation_text] = True
|
||
|
||
# Append the row data to transformed_data
|
||
transformed_data.append(row_data)
|
||
|
||
transformed_df = pd.DataFrame(transformed_data)
|
||
# At the moment, we're only using a limited set of columns - let's just keep cavity wall insulation
|
||
# recommendations
|
||
transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]
|
||
|
||
# Get the find my epc data
|
||
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
|
||
pd.json_normalize(epc_df["find_my_epc_data"])
|
||
)
|
||
# We check if we get the solar pv column:
|
||
if "Solar photovoltaics" not in find_my_epc_data.columns:
|
||
find_my_epc_data["Solar photovoltaics"] = False
|
||
|
||
# Retrieve just the data we need
|
||
epc_df = epc_df[
|
||
[
|
||
"row_id",
|
||
"uprn",
|
||
"address1",
|
||
"address",
|
||
"postcode",
|
||
"property-type",
|
||
"built-form",
|
||
"inspection-date",
|
||
"current-energy-rating",
|
||
"current-energy-efficiency",
|
||
"roof-description",
|
||
"walls-description",
|
||
"floor-description",
|
||
"transaction-type",
|
||
# New fields needed
|
||
"secondheat-description",
|
||
"total-floor-area",
|
||
"construction-age-band",
|
||
"floor-height",
|
||
"number-habitable-rooms",
|
||
"mainheat-description",
|
||
#
|
||
"energy-consumption-current", # kwh/m2
|
||
"photo-supply",
|
||
]
|
||
].rename(
|
||
columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
|
||
)
|
||
|
||
asset_list = asset_list.merge(
|
||
epc_df,
|
||
how="left",
|
||
on="row_id"
|
||
).merge(
|
||
find_my_epc_data[
|
||
[
|
||
"row_id", "heating_text", "hot_water_text", 'Assessor’s name',
|
||
"Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
|
||
"Assessor’s ID", "Solar photovoltaics"
|
||
]
|
||
].rename(
|
||
columns={
|
||
"Solar photovoltaics": "Has Solar PV",
|
||
"heating_text": "Heating Estimated kWh",
|
||
"hot_water_text": "Hot Water Estimated kWh",
|
||
}
|
||
),
|
||
how="left",
|
||
on="row_id"
|
||
)
|
||
|
||
asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
|
||
asset_list = asset_list.drop(columns=["photo-supply"])
|
||
|
||
# Rename the columns
|
||
asset_list = asset_list.rename(columns={
|
||
"inspection-date": "Date of last EPC",
|
||
"current-energy-efficiency": "SAP score on register",
|
||
"current-energy-rating": "EPC rating on register",
|
||
"property-type": "Property Type",
|
||
"built-form": "Archetype - EPC",
|
||
"total-floor-area": "Property Floor Area",
|
||
"construction-age-band": "Property Age Band",
|
||
"floor-height": "Property Floor Height",
|
||
"number-habitable-rooms": "Number of Habitable Rooms",
|
||
"walls-description": "Wall Construction",
|
||
"roof-description": "Roof Construction",
|
||
"floor-description": "Floor Construction",
|
||
"mainheat-description": "Heating Type",
|
||
"secondheat-description": "Secondary Heating",
|
||
"transaction-type": "Reason for last EPC",
|
||
"energy-consumption-current": "Heat Demand (kWh/m2)",
|
||
})
|
||
|
||
asset_list["Estimated Number of Floors"] = asset_list.apply(
|
||
lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
|
||
x["Property Type"]) else None, axis=1
|
||
)
|
||
|
||
asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
|
||
# Replace "" value with None
|
||
asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
|
||
asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
|
||
|
||
asset_list["Estimated Perimeter (m)"] = asset_list.apply(
|
||
lambda x: estimate_perimeter(
|
||
floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
|
||
num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
|
||
), axis=1
|
||
)
|
||
|
||
asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
|
||
lambda x: estimate_external_wall_area(
|
||
num_floors=x["Estimated Number of Floors"],
|
||
floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
|
||
perimeter=x["Estimated Perimeter (m)"],
|
||
built_form=x["Archetype - EPC"]
|
||
),
|
||
axis=1
|
||
)
|
||
|
||
asset_list["Roof Insulation Thickness"] = asset_list.apply(
|
||
lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
|
||
x["Roof Construction"]) else None,
|
||
axis=1
|
||
)
|
||
|
||
# We produce some additional fields
|
||
# 1) Is the SAP rating below C75
|
||
asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75
|
||
# 2) Flag anything where the EPC is older than 5 years
|
||
cutoff_year = pd.Timestamp.now().year - 5
|
||
asset_list[f"EPC is pre {cutoff_year}"] = (
|
||
pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year
|
||
)
|
||
|
||
# 3) If we have year in the asset list, we flag entries where the built year is different from the
|
||
# EPC Age band
|
||
if PROPERTY_YEAR_BUILT is not None:
|
||
asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
|
||
lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
|
||
)
|
||
|
||
if HAS_NON_INTRUSIVES:
|
||
# Empty cavity:
|
||
# 1) Has been flagged on the non-intrusives as being empty or partially filled
|
||
# 2) The age is before 1995
|
||
# 3) Remove anything that likely has access issues
|
||
asset_list["Suitable for Cavity Fill"] = (
|
||
(asset_list["Construction"] == "CAVITY") &
|
||
asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
|
||
(
|
||
(asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
|
||
)
|
||
)
|
||
|
||
# asset_list["Suitable for Extraction"] =
|
||
asset_list[
|
||
(asset_list["Construction"] == "Cavity") &
|
||
asset_list["Insulated"].isin(["RETRO DRILLED"]) &
|
||
(
|
||
(asset_list[PROPERTY_YEAR_BUILT] <= 1995)
|
||
) &
|
||
(
|
||
asset_list[]
|
||
)
|
||
]
|
||
|
||
# 4) Flag properties that look like they're good candidates for solar installs
|
||
# Firstly, flag if the fabric is completely done
|
||
|
||
insulated_wall_substrings = [
|
||
", insulated", "with external insulation", "with internal insulation", "filled cavity"
|
||
]
|
||
|
||
insulated_roof_substrings = [
|
||
"(another dwelling above)", "limited insulation", "(other premises above)",
|
||
", no insulation",
|
||
]
|
||
|
||
def check_solar_insulation_conditions(x):
    """Classify the fabric (walls / roof / floor) of one property row.

    Reads ``"Wall Construction"``, ``"Roof Construction"``,
    ``"Floor Construction"`` and ``"Roof Insulation Thickness"`` from ``x``,
    and uses ``insulated_wall_substrings`` / ``insulated_roof_substrings``
    from the enclosing scope.

    Returns:
        ``None`` when there is no wall description, otherwise one of
        ``"Walls, Roof and Floor have U-values below 0.7"``,
        ``"Confirm U-values"``,
        ``"Walls Insulated, Roof Insulated, Floor Solid"``,
        ``"Walls Insulated, Floor Solid, Loft need top-up"`` or
        ``"Not Fully Insulated or no data"``.
    """
    wall_description = x["Wall Construction"]
    if pd.isnull(wall_description):
        return None

    def _lowered(value):
        # A missing description counts as empty text instead of crashing on
        # `.lower()` (empty strings come back from CSV as NaN).
        return "" if pd.isnull(value) else str(value).lower()

    wall_text = wall_description.lower()

    if "average thermal transmittance" in wall_text:
        # We extract out the u-values
        wall_uvalue = extract_thermal_transmittance({}, x["Wall Construction"])[0]["thermal_transmittance"]
        roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"]
        floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"]

        # A missing roof/floor U-value is treated as 0, i.e. passing the cutoff.
        roof_uvalue = 0 if roof_uvalue is None else roof_uvalue
        floor_uvalue = 0 if floor_uvalue is None else floor_uvalue

        # A missing wall U-value cannot be compared with the cutoff; ask for
        # confirmation rather than raising on `None < 0.7`.
        if wall_uvalue is None:
            return "Confirm U-values"

        # We apply some cutoffs
        if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7:
            return "Walls, Roof and Floor have U-values below 0.7"

        return "Confirm U-values"

    walls_insulated = any(marker in wall_text for marker in insulated_wall_substrings)

    roof_thickness = str(x["Roof Insulation Thickness"])
    roof_is_numeric = roof_thickness.isdigit()
    if roof_is_numeric:
        roof_insulated = int(roof_thickness) >= 200
    else:
        # NOTE(review): insulated_roof_substrings includes "limited insulation"
        # and ", no insulation" - confirm these should count as insulated.
        roof_text = _lowered(x["Roof Construction"])
        roof_insulated = any(marker in roof_text for marker in insulated_roof_substrings)

    floor_is_solid = "solid" in _lowered(x["Floor Construction"])

    if walls_insulated and roof_insulated and floor_is_solid:
        return "Walls Insulated, Roof Insulated, Floor Solid"

    if walls_insulated and floor_is_solid and roof_is_numeric:
        return "Walls Insulated, Floor Solid, Loft need top-up"

    return "Not Fully Insulated or no data"
|
||
|
||
asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1)
|
||
|
||
asset_list["Good Solar Candidate"] = (
|
||
asset_list["SAP Rating is 75 and below"] &
|
||
~asset_list["Has Solar PV"] &
|
||
(
|
||
asset_list["Heating Type"].isin(
|
||
[
|
||
"Electric storage heaters",
|
||
"Room heaters, electric",
|
||
]
|
||
) | asset_list["Heating Type"].str.contains("heat pump", case=False)
|
||
) & (
|
||
asset_list["Solar Fabric Condition"].isin(
|
||
[
|
||
"Walls Insulated, Roof Insulated, Floor Solid",
|
||
"Walls, Roof and Floor have U-values below 0.7",
|
||
"Walls Insulated, Floor Solid, Loft need top-up"
|
||
]
|
||
)
|
||
)
|
||
)
|
||
|
||
def flat_analysis(asset_list):
    """Summarise, per postcode, how many flats score below SAP 75.

    Rows are grouped by ``POSTCODE_COLUMN`` (from the enclosing scope) and
    ``"Property Type"``; only groups whose property type is "flat"
    (case-insensitive) are reported.

    Returns:
        DataFrame with one row per postcode / flat group and the columns
        ``Postcode``, ``Property Type``, ``Number of Flats with EPC``,
        ``Number of Flats below C75`` and
        ``Proportion of Flat EPCs below C75`` (a rounded percentage). The
        columns are present even when no flats are found.
    """
    output_columns = [
        "Postcode",
        "Property Type",
        "Number of Flats with EPC",
        "Number of Flats below C75",
        "Proportion of Flat EPCs below C75",
    ]

    # We want to deduce if flats have 50% of the properties below C75
    flat_rows = []
    for (postcode, property_type), group in asset_list.groupby([POSTCODE_COLUMN, "Property Type"]):
        # Each group holds a single property type, so testing the key is
        # enough.
        if str(property_type).lower() != "flat":
            continue

        num_flats = len(group)
        # NOTE(review): strictly below 75 here, whereas the
        # "SAP Rating is 75 and below" flag uses <= 75 - confirm intended.
        num_below_c75 = group["SAP score on register"].lt(75).sum()

        flat_rows.append(
            {
                "Postcode": postcode,
                "Property Type": "Flat",
                "Number of Flats with EPC": num_flats,
                "Number of Flats below C75": num_below_c75,
                "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats),
            }
        )

    return pd.DataFrame(flat_rows, columns=output_columns)
|
||
|
||
flat_data = flat_analysis(asset_list)
|
||
|
||
# For all of the columns in transformed_df, prefix with "Recommendation: "
|
||
for col in transformed_df.columns:
|
||
if col == "row_id":
|
||
continue
|
||
transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
|
||
|
||
asset_list = asset_list.merge(
|
||
transformed_df,
|
||
how="left",
|
||
on="row_id"
|
||
)
|
||
asset_list = asset_list.drop(columns=["row_id", "index"])
|
||
|
||
# Store as an excel
|
||
filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
|
||
# Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
|
||
|
||
with pd.ExcelWriter(filename) as writer:
|
||
asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
|
||
flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
|
||
|
||
matches_review = asset_list[
|
||
[FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
|
||
]
|