debugging handling of year built

This commit is contained in:
Khalim Conn-Kowlessar 2025-04-01 22:43:50 +01:00
parent 1d0c8a3e43
commit 2d69c671d3
7 changed files with 327 additions and 51 deletions

View file

@ -368,7 +368,7 @@ class AssetList:
self.contact_detail_fields = None
self.outcomes = None
self.outcomes_no_match = None
self.outcomes_for_output = None
self.outcomes_for_output = pd.DataFrame()
self.master_surveyed = None
# We detect the presence of the non-intrusive columns
@ -701,6 +701,13 @@ class AssetList:
if match:
return int(match.group(1)) # Extract the year and convert to integer
if "-" in date_str:
# Count the number of times we have "-", as we've seen double ranges
# (when we have extensions) so the format is like this:
# 'G: 1983-1990, H: 1991-1995'
if date_str.count("-") == 2:
# We have a range
return int(date_str.split("-")[1].split(",")[0])
# We probably have a range
return int(date_str.split("-")[1].strip())
@ -1084,8 +1091,15 @@ class AssetList:
)
elif self.old_format_non_intrusives_present:
non_intrusives_wall_filter = (
self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().isin(
self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin(
["empty cavity", "partial fill"]
) | (
(
self.standardised_asset_list['non-intrusives: WFT Findings']
.str.lower().str.strip().str.contains("empty cavity|partial fill") &
~self.standardised_asset_list['non-intrusives: WFT Findings']
.astype(str).str.lower().str.strip().str.contains("major access issues")
)
)
)
else:
@ -1114,6 +1128,20 @@ class AssetList:
(self.standardised_asset_list["epc_year_upper_bound"] <= 2002)
)
self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = (
(~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
non_intrusives_wall_filter &
(self.standardised_asset_list["epc_year_upper_bound"] <= 2002) &
(
self.standardised_asset_list[
self.EPC_API_DATA_NAMES["current-energy-efficiency"]
] <= self.EMPTY_CAVITY_SAP_THRESHOLD
) & (
# If the property has solar, there's a chance it won't qualify
self.standardised_asset_list["property_has_solar"]
)
)
else:
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = (
(~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
@ -1158,7 +1186,7 @@ class AssetList:
~self.standardised_asset_list["property_has_solar"]
)
# We also add a filter on anything that was generally identified by the none-intrusives
# We also add a filter on anything that was generally identified by the non-intrusives
self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter_no_year_filter"] = (
(~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
non_intrusives_wall_filter
@ -1290,7 +1318,8 @@ class AssetList:
print("Review these categories with Kieran")
extraction_wall_filter = (
self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin(
["retro drilled", "retro filled", "fibre from build", "polybead"]
["retro drilled", "retro filled", "fibre from build", "polybead", "retro drilled and filled",
"retro drilled & filled", "blown in white wool", "blown in yellow wool"]
)
)
@ -1727,25 +1756,6 @@ class AssetList:
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
)
self.standardised_asset_list["test"] = (
not_a_flat &
# Landlord data or EPC data indicates the heating system is appropriate
correct_heating_system &
# The property doesn't currently have solar
~self.standardised_asset_list["property_has_solar"] &
# The walls are insulated
walls_meet_solar_requirements &
# Roof is insulated
self.standardised_asset_list["solar_epc_roof_insulated"] &
# SAP below threshold
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
)
self.standardised_asset_list["test"] = np.where(
self.standardised_asset_list["solar_eligible_other_floor"],
False,
self.standardised_asset_list["test"]
)
self.standardised_asset_list["solar_eligible_other_floor_sap_above_threshold"] = (
not_a_flat &
# Landlord data or EPC data indicates the heating system is appropriate
@ -1869,6 +1879,32 @@ class AssetList:
~self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
)
# Check if the boiler is electric
# We match the exact EPC main-heat descriptions for electric boilers, or a landlord heating system of "electric boiler"
has_electric_boiler = (
(
self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]]
.str.lower().isin(
["boiler and radiators, electric", "boiler and underfloor heating, electric"])
) | (
self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler"
)
)
# We check for a specific sub-set of properties which are uninsulated solid wall properties that are EPC E
# or below (we'll use 57 as a threshold) - These are for a pilot with Net Zero Renewables
self.standardised_asset_list["solar_eligible_solid_wall_uninsulated"] = (
not_a_flat &
# Landlord data or EPC data indicates the heating system is appropriate - in this case, we can also take
# electric boilers
(correct_heating_system | has_electric_boiler) &
# The property doesn't currently have solar
~self.standardised_asset_list["property_has_solar"] &
# The walls are uninsulated solid
~walls_meet_solar_requirements &
(self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 57)
)
# Drop anything we don't need
self.standardised_asset_list = self.standardised_asset_list.drop(
columns=["walls_u_value", "roof_u_value", "floor_u_value"]
@ -2009,7 +2045,8 @@ class AssetList:
),
"solar_eligible_other_floor_needs_loft_needs_heating_upgrade_sap_above_threshold": (
"Solar Eligible, Other Floor, Needs Loft, Needs Heating Upgrade, SAP Above Threshold"
)
),
"solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below",
}
for variable, reason in solar_reason_map.items():
@ -2020,20 +2057,30 @@ class AssetList:
)
# Flag anything that has existing outcomes
if self.outcomes is not None:
self.standardised_asset_list["cavity_reason"] = np.where(
(
(self.standardised_asset_list["Surveyed"] > 0) |
(self.standardised_asset_list["Installer Refusal"] > 0)
),
None,
self.standardised_asset_list["cavity_reason"]
)
if (self.outcomes is not None) and ("Surveyed" in self.standardised_asset_list.columns):
if "Installer Refusal" not in self.standardised_asset_list.columns:
self.standardised_asset_list["cavity_reason"] = np.where(
(
(self.standardised_asset_list["Surveyed"] > 0)
),
None,
self.standardised_asset_list["cavity_reason"]
)
else:
self.standardised_asset_list["cavity_reason"] = np.where(
(
(self.standardised_asset_list["Surveyed"] > 0) |
(self.standardised_asset_list["Installer Refusal"] > 0)
),
None,
self.standardised_asset_list["cavity_reason"]
)
if self.master_surveyed is not None:
self.standardised_asset_list["cavity_reason"] = np.where(
(
(~pd.isnull(self.standardised_asset_list["SUBMISSION DATE"]))
(~pd.isnull(self.standardised_asset_list["submission_date"]))
),
None,
self.standardised_asset_list["cavity_reason"]
@ -2064,9 +2111,11 @@ class AssetList:
~pd.isnull(self.standardised_asset_list["cavity_reason"]) |
~pd.isnull(self.standardised_asset_list["solar_reason"])
][self.DOMNA_PROPERTY_ID].values
self.outcomes_for_output = self.outcomes[
self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work)
]
if self.DOMNA_PROPERTY_ID in self.outcomes.columns:
self.outcomes_for_output = self.outcomes[
self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work)
]
def flat_analysis(self):
@ -2398,6 +2447,7 @@ class AssetList:
self,
outcomes_filepath,
outcomes_sheetname,
outcomes_address,
outcomes_postcode,
outcomes_houseno,
outcomes_id
@ -2408,6 +2458,12 @@ class AssetList:
self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
self.outcomes["row_id"] = self.outcomes.index
if outcomes_houseno is None:
outcomes_houseno = "houseno"
self.outcomes["houseno"] = self.outcomes[outcomes_address].apply(
lambda x: SearchEpc.get_house_number(x, self.outcomes[outcomes_postcode])
)
logger.info("Matching outcomes to asset list")
# Merge the outcomes onto the asset list - we check we're able to match sufficiently well
lookup = []
@ -2433,7 +2489,7 @@ class AssetList:
)
continue
address_clean = x["Address"].lower().replace(",", "").replace(" ", " ")
address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ")
matched = self.standardised_asset_list[
(self.standardised_asset_list[
@ -2451,13 +2507,14 @@ class AssetList:
continue
matched = self.standardised_asset_list[
(self.standardised_asset_list[self.STANDARD_POSTCODE] == x[outcomes_postcode])
(self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == x[outcomes_postcode])
].copy()
if not matched.empty:
matched["houseno"] = matched.apply(
lambda x: SearchEpc.get_house_number(x[self.STANDARD_ADDRESS_1], x[self.STANDARD_POSTCODE]),
axis=1
)
matched = matched[
matched["houseno"].astype(str) == str(x[outcomes_houseno])
]
@ -2469,6 +2526,8 @@ class AssetList:
}
)
continue
elif matched.shape[0] > 1:
raise NotImplementedError("Check me")
elif not matched.empty:
# Use Levenshtein distance to match
matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE]
@ -2498,6 +2557,7 @@ class AssetList:
# happened multiple times, in this case we judge that the work may not be viable
date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date"
lookup = lookup.merge(
self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id"
)
@ -2568,7 +2628,13 @@ class AssetList:
# We just need to check if any were cancelled
master_to_append = master_data[
["UPRN", install_col, submission_col]
].rename(columns={"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, install_col: "survey_status"})
].rename(
columns={
"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
install_col: "survey_status",
submission_col: "submission_date"
}
)
master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
master_surveyed.append(master_to_append)

View file

@ -88,6 +88,32 @@ def app():
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
# - Or the insulation required is loft/cavity (floors should be solid)
# Southern Midlands
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025"
data_filename = "Southern Housing Midlands Property List - combined.xlsx"
sheet_name = "Sheet 1"
postcode_column = 'Post Code'
fulladdress_column = "Address"
address1_column = None
address1_method = "house_number_extraction"
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = "Age_1"
landlord_os_uprn = None
landlord_property_type = "Prop_Type"
landlord_built_form = "Prop_Type"
landlord_wall_construction = "Walls_P"
landlord_heating_system = "Heating System"
landlord_existing_pv = None
landlord_property_id = "AssetID"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
master_filepaths = []
master_to_asset_list_filepath = None
# Live West (2018 Asset list)
data_folder = (
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/2018 Asset List"
@ -149,6 +175,84 @@ def app():
]
master_to_asset_list_filepath = None
# PFP London
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/London"
data_filename = "PFP AREAS SURROUNDING LONDON - JAY, RUTH & LANE.xlsx"
sheet_name = "PFP SURROUNDING LONDON"
postcode_column = 'Postcode'
fulladdress_column = None
address1_column = "AddressLine1"
address1_method = None
address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = "Archetype (PFP)"
landlord_built_form = "Archetype (PFP)"
landlord_wall_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Uprn"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
master_filepaths = []
master_to_asset_list_filepath = None
# PFP North-West
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-West"
data_filename = "Places for People NORTH WEST - INSPECTIONS MASTER - UPDATE.xlsx"
sheet_name = "CHECKED"
postcode_column = 'Postcode'
fulladdress_column = None
address1_column = "AddressLine1"
address1_method = None
address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = "Archetype (PFP)"
landlord_built_form = "Archetype (PFP)"
landlord_wall_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Uprn"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
master_filepaths = []
master_to_asset_list_filepath = None
# PFP North-East
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-East"
data_filename = "Places for People NORTH EAST - INSPECTIONS MASTER.xlsx"
sheet_name = "CHECKED"
postcode_column = 'Postcode'
fulladdress_column = None
address1_column = "AddressLine1"
address1_method = None
address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
missing_postcodes_method = None
landlord_year_built = None
landlord_os_uprn = None
landlord_property_type = "Archetype (PFP)"
landlord_built_form = "Archetype (PFP)"
landlord_wall_construction = None
landlord_heating_system = None
landlord_existing_pv = None
landlord_property_id = "Uprn"
outcomes_filename = None
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
master_filepaths = []
master_to_asset_list_filepath = None
# PFP East
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East"
data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx"
@ -171,6 +275,7 @@ def app():
outcomes_sheetname = None
outcomes_postcode = None
outcomes_houseno = None
outcomes_id = None
master_filepaths = []
master_to_asset_list_filepath = None
@ -264,6 +369,7 @@ def app():
outcomes_houseno = None
master_filepaths = []
master_to_asset_list_filepath = None
outcomes_id = None
# For ACIS - programme re-build
# data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025"
@ -386,6 +492,7 @@ def app():
asset_list.flag_outcomes(
outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None,
outcomes_sheetname=outcomes_sheetname,
outcomes_address=outcomes_address,
outcomes_postcode=outcomes_postcode,
outcomes_houseno=outcomes_houseno,
outcomes_id=outcomes_id
@ -403,7 +510,7 @@ def app():
epc_api_only = False
force_retrieve_data = False
skip = None # Used to skip already completed chunks
chunk_size = 2000
chunk_size = 5000
filename = "Chunk {i}.csv"
download_folder = os.path.join(data_folder, "Chunks")
if not os.path.exists(download_folder):
@ -418,6 +525,9 @@ def app():
if all(x in folder_contents for x in downloaded_files):
skip = max(chunk_indexes)
if any(x in folder_contents for x in downloaded_files):
skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents])
# folder_contents = [f for f in folder_contents if "nodata" not in f and f.endswith(".csv")]
for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
@ -582,8 +692,6 @@ def app():
)
cleaned = msgpack.unpackb(cleaned, raw=False)
# TODO: We should break out the identification of work types to flag blocks of flats specifically
# TODO: Append existing outcomes onto the sheet.
asset_list.identify_worktypes(cleaned)
pprint(asset_list.work_type_figures)
@ -729,7 +837,7 @@ def app():
asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
# If we have outcomes, we add a tab with the outcomes
if asset_list.outcomes_for_output is not None:
if not asset_list.outcomes_for_output.empty:
asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False)
# Store the Hubspot export as a csv

View file

@ -1,3 +1,5 @@
import numpy as np
STANDARD_BUILT_FORMS = {
"unknown",
# Houses
@ -54,6 +56,13 @@ BUILT_FORM_MAPPINGS = {
'Coach House': 'detached',
'Office Buildings': 'unknown',
'Maisonnette': 'mid-floor',
'Bedspace': 'unknown'
'Bedspace': 'unknown',
'Studio (3rd floor and above)': 'top-floor',
'Adapted Property For Disabled': 'unknown',
'Studio (2nd floor)': 'mid-floor',
np.nan: 'unknown',
'Third Floor Flat': 'mid-floor',
'2 Ext. Wall Flat': 'mid-terrace',
'Hostel': 'unknown'
}

View file

@ -26,7 +26,7 @@ STANDARD_HEATING_SYSTEMS = {
HEATING_MAPPINGS = {
"Combi - GAS": "gas combi boiler",
"E7 Storage Heaters": "electric storage heaters",
"E7 Storage Heaters": "high heat retention storage heaters",
"District heating system": "district heating",
"Condensing Boiler - GAS": "gas condensing boiler",
"Boiler Oil/other": "oil boiler",
@ -44,7 +44,7 @@ HEATING_MAPPINGS = {
"Gas fire": "other",
"Backboiler - Solid fuel": "other",
'combi - gas': 'gas combi boiler',
'e7 storage heaters': 'electric storage heaters',
'e7 storage heaters': 'high heat retention storage heaters',
'district heating system': 'district heating',
'condensing boiler - gas': 'gas condensing boiler',
'boiler oil/other': 'oil boiler',
@ -117,10 +117,17 @@ HEATING_MAPPINGS = {
'Mains Gas (Communal)': 'communal gas boiler',
'LPG': 'boiler - other fuel',
'Mains Gas': 'gas condensing boiler',
'ELECTRIC': 'electric fuel',
'OIL': 'oil fuel',
'SOLID FUEL': 'solid fuel',
'GAS': 'gas combi boiler',
'DO NOT SURVEY': 'unknown'
'DO NOT SURVEY': 'unknown',
'Gas Boiler': 'gas combi boiler',
'Communal Gas ': 'communal gas boiler',
'Communal': 'communal gas boiler',
'Communal Gas': 'communal gas boiler',
'Wood Burning Boiler': "boiler - other fuel",
'Oil Fired Boiler': 'oil boiler'
}

View file

@ -106,7 +106,6 @@ PROPERTY_MAPPING = {
'Office Buildings': 'unknown',
'Semi Detached Bung': 'bungalow',
'Bedspace': 'bedsit',
'Houses/Bungalows': 'bungalow',
'Bedsits': 'bedsit',
'Unknown': 'unknown',
@ -114,6 +113,12 @@ PROPERTY_MAPPING = {
'House/Bungalow ': 'bungalow',
'Low/Med Rise Flats/Mais': 'flat',
'Staff/Comm': 'other',
'A Rooms': 'other'
'A Rooms': 'other',
'Studio (3rd floor and above)': 'flat',
'Adapted Property For Disabled': 'unknown',
'Studio (2nd floor)': 'flat',
'Third Floor Flat': 'flat',
'2 Ext. Wall Flat': 'flat',
'Hostel': 'other'
}

View file

@ -0,0 +1,71 @@
"""
Rough script to get the EPC data for Benyon
"""
import pandas as pd
import os
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from asset_list.utils import get_data
load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")


def _postcode_from_address(address):
    """Return the postcode, taken as the last two whitespace-separated tokens of *address*.

    Splitting on any run of whitespace (rather than on single spaces) means trailing or doubled
    spaces in the address no longer produce an empty or truncated postcode.

    Raises IndexError if the address has fewer than two tokens.
    """
    parts = address.split()
    return parts[-2] + " " + parts[-1]


# The sheet's header is on the second row; the three columns are renamed to fixed names
asset_list = pd.read_excel(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/List of All Properties ecl Grd Rents in "
    "Alphabetical Order.xlsx",
    header=1
)
asset_list.columns = ["tennancy", "landlord_id", "landlord_address"]

# Get postcode as the last 2 parts of the address
asset_list["postcode"] = asset_list["landlord_address"].apply(_postcode_from_address)
asset_list["house_no"] = asset_list.apply(
    lambda x: SearchEpc.get_house_number(address=x["landlord_address"], postcode=x["postcode"]), axis=1
)

epc_data, errors, no_epc = get_data(
    df=asset_list,
    manual_uprn_map={},
    epc_auth_token=EPC_AUTH_TOKEN,
    uprn_column=None,
    fulladdress_column="landlord_address",
    address1_column="house_no",
    postcode_column="postcode",
    property_type_column=None,
    built_form_column=None,
    epc_api_only=True,
    row_id_name="landlord_id",
)

# Rows whose landlord_id is listed in no_epc - not used below, kept for interactive inspection
df = asset_list[asset_list["landlord_id"].isin(no_epc)]

epc_df = pd.DataFrame(epc_data)

# Print the summaries: as bare expressions their results were discarded when run as a script
print(epc_df["current-energy-rating"].value_counts())
print(epc_df["property-type"].value_counts())
print(epc_df["walls-description"].value_counts(normalize=True))

# Keep a handle on the un-enriched frame so both exports are built from the same base
base_asset_list = asset_list

# Summary export: asset list plus a handful of EPC columns
asset_list = base_asset_list.merge(
    epc_df[
        [
            "landlord_id", "current-energy-rating", "property-type", "total-floor-area", "roof-description",
            "walls-description", "co2-emissions-current"
        ]
    ],
    how="left",
    on="landlord_id"
)
asset_list.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list.csv", index=False
)

# Full export: merge onto the base frame rather than the summary frame, otherwise the summary EPC columns
# appear twice with _x/_y suffixes
asset_list_big = base_asset_list.merge(
    epc_df,
    how="left",
    on="landlord_id"
)
asset_list_big.to_csv(
    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list_full_data.csv",
    index=False
)

View file

@ -332,6 +332,16 @@ class RetrieveFindMyEpc:
"Replacement warm air unit": [],
"Secondary glazing": ["secondary_glazing"],
"Condensing heating unit": ["boiler_upgrade"],
'???': [],
'Solar photovoltaic panels, 2.5kWp': ["solar_pv"],
'Heating controls (programmer, room thermostat and thermostatic radiator valves)': [
"roomstat_programmer_trvs", "time_temperature_zone_control"
],
'Translation missing: en.improvement_code.41.title': [],
"Condensing boiler (separate from the range cooker)": ["boiler_upgrade"],
"Heating controls (programmer and thermostatic radiator valves)": [
"roomstat_programmer_trvs", "time_temperature_zone_control"
]
}
survey = True