Getting ready to work on the Colchester data

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-24 16:11:02 +00:00
parent 7b4218299f
commit 99a0948e2b
2 changed files with 78 additions and 87 deletions

View file

@ -343,6 +343,7 @@ class AssetList:
self.standardised_asset_list = self.raw_asset_list.copy()
# Will be used to store aggregated figures against the various work types
self.work_type_figures = {}
self.flat_data = None
# We detect the presence of the non-intrusive columns
self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
@ -649,6 +650,9 @@ class AssetList:
logger.info("Applying standardisation to asset list")
for variable, mapping in self.variable_mappings.items():
self.standardised_asset_list[variable + "_original_from_landlord"] = (
self.standardised_asset_list[variable].copy()
)
self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
@ -663,6 +667,12 @@ class AssetList:
# Apply renames to our standard names
# Perform final variable selection and renaming:
# We add the original columns to the keep variables
self.keep_variables += [
k + "_original_from_landlord" for k in self.variable_mappings.keys()
]
self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename(
columns=self.rename_map
)
@ -912,18 +922,6 @@ class AssetList:
self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD
)
)
self.standardised_asset_list["empty_cavity"] = (
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] |
self.standardised_asset_list["epc_indicates_empty_cavity"]
)
# We add a reason
self.standardised_asset_list["empty_cavity_reason"] = np.where(
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
"Non-Intrusive Data",
"EPC Data"
)
######################################################
# Extraction
######################################################
@ -933,7 +931,7 @@ class AssetList:
self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
(self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
(self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
(~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"])
(~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "COMPACTED BEAD"])
) & (
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
)
@ -996,6 +994,12 @@ class AssetList:
)
)
self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = (
self.standardised_asset_list["non-intrusives: Insulated"].isin(
["EWI", "RETRO DRILLED", "FILLED AT BUILD"]
)
)
# TODO: We don't have information about the roof from this landlord
# We merge on the u-value for average thermal transmittance
@ -1146,7 +1150,8 @@ class AssetList:
# The walls are insulated
(
self.standardised_asset_list["solar_landlord_walls_insulated"] |
self.standardised_asset_list["solar_epc_walls_insulated"]
self.standardised_asset_list["solar_epc_walls_insulated"] |
self.standardised_asset_list["solar_non_intrusives_walls_insulated"]
) &
# Roof is insulated
self.standardised_asset_list["solar_epc_roof_insulated"] &
@ -1165,7 +1170,8 @@ class AssetList:
# The walls are insulated
(
self.standardised_asset_list["solar_landlord_walls_insulated"] |
self.standardised_asset_list["solar_epc_walls_insulated"]
self.standardised_asset_list["solar_epc_walls_insulated"] |
self.standardised_asset_list["solar_non_intrusives_walls_insulated"]
) &
# Loft needs a top-up (solar_epc_loft_needs_topup flag)
self.standardised_asset_list["solar_epc_loft_needs_topup"] &
@ -1216,6 +1222,15 @@ class AssetList:
columns=["walls_u_value", "roof_u_value", "floor_u_value"]
)
# Adjust flagged extraction jobs to remove anything for solar
self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] &
~self.standardised_asset_list["solar_eligible_solid_floor"] &
~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"]
# ~self.standardised_asset_list["solar_eligible_other_floor"] &
# ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"]
)
# Produce some aggregate figures
self.work_type_figures = {
# Empty cavity from non-intrusives
@ -1296,3 +1311,40 @@ class AssetList:
"Other Floor, Insulated, Needs Loft",
self.standardised_asset_list["solar_reason"]
)
def flat_analysis(self):
    """Summarise EPC scores for flats and store the result on ``self.flat_data``.

    ``self.standardised_asset_list`` is grouped by postcode and property
    type. For every group whose property type is ``"flat"`` one summary row
    is produced holding the group's row count, the number of rows whose
    current energy efficiency score is below
    ``self.FILLED_CAVITY_SAP_THRESHOLD`` and the number below 69.

    The summary is always a DataFrame with the full set of columns, even
    when the asset list contains no flats, so it can be written to Excel
    with its headers intact.

    Returns:
        None. The summary is assigned to ``self.flat_data``.
    """
    # NOTE(review): the original comments mention deducing a building name by
    # stripping the house number; that is not implemented - grouping is by
    # postcode only.
    summary_columns = [
        "Postcode",
        "Property Type",
        "Number of Flats with EPC",
        "Number of Flats below C75",
        "Proportion of Flat EPCs below C75",
        "Number of Flats Below C69",
    ]

    # Loop-invariant lookups are resolved once up front.
    postcode_col = self.STANDARD_POSTCODE
    property_type_col = self.STANDARD_PROPERTY_TYPE
    sap_col = self.EPC_API_DATA_NAMES["current-energy-efficiency"]
    # NOTE(review): the column labels say "C75"; presumably this threshold
    # is 75 - confirm against the class constant.
    c75_threshold = self.FILLED_CAVITY_SAP_THRESHOLD
    c69_threshold = 69

    rows = []
    grouped = self.standardised_asset_list.groupby(
        [postcode_col, property_type_col]
    )
    for _, group in grouped:
        if "flat" not in group[property_type_col].values:
            continue

        sap_scores = group[sap_col]
        # NOTE(review): this counts every row in the group, including rows
        # with a missing score (which never count as "below"). Confirm
        # whether the "with EPC" label should use non-null scores only.
        num_flats = len(group)
        num_below_c75 = sap_scores.lt(c75_threshold).sum()
        num_below_c69 = sap_scores.lt(c69_threshold).sum()

        rows.append(
            {
                "Postcode": group[postcode_col].iloc[0],
                "Property Type": "Flat",
                "Number of Flats with EPC": num_flats,
                "Number of Flats below C75": num_below_c75,
                "Proportion of Flat EPCs below C75": round(
                    100 * num_below_c75 / num_flats
                ),
                "Number of Flats Below C69": num_below_c69,
            }
        )

    # Explicit columns keep the schema stable when there are no flats.
    self.flat_data = pd.DataFrame(rows, columns=summary_columns)

View file

@ -4,6 +4,7 @@ import json
import pandas as pd
import numpy as np
from tqdm import tqdm
from pprint import pprint
import msgpack
from utils.s3 import read_from_s3
from asset_list.AssetList import AssetList
@ -239,23 +240,18 @@ def app():
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
# - Or the insulation required is loft/cavity (floors should be solid)
# For Colchester (previously Westward)
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
DATA_FILENAME = "WESTWARD - completed list..xlsx"
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
SHEET_NAME = "Sheet1"
POSTCODE_COLUMN = "WFT EDIT Postcode"
FULLADDRESS_COLUMN = "Address"
POSTCODE_COLUMN = 'Full Address.1'
FULLADDRESS_COLUMN = "Full Address"
ADDRESS1_COLUMN = None
ADDRESS1_METHOD = "house_number_extraction"
ADDRESS1_METHOD = "first_word"
ADDRESS_COLS_TO_CONCAT = []
MISSING_POSTCODES_METHOD = None
PROPERTY_YEAR_BUILT = "Build date"
UPRN_COLUMN = "UPRN"
# If we have the non-intrusives data, this should be true
HAS_NON_INTRUSIVES = True
PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits
PROPERTY_YEAR_BUILT = "Build Date"
UPRN_COLUMN = None
PROPERTY_TYPE_COLUMN = None
# Maps addresses to uprn in problematic cases
MANUAL_UPRN_MAP = {}
@ -297,20 +293,6 @@ def app():
asset_list.apply_standardiation()
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
# DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
# SHEET_NAME = "Sheet1"
# POSTCODE_COLUMN = 'Full Address.1'
# FULLADDRESS_COLUMN = "Full Address"
# ADDRESS1_COLUMN = None
# ADDRESS1_METHOD = "first_word"
# ADDRESS_COLS_TO_CONCAT = []
# MISSING_POSTCODES_METHOD = None
# PROPERTY_YEAR_BUILT = "Build Date"
# UPRN_COLUMN = None
# # If we have the non-intrusives data, this should be true
# HAS_NON_INTRUSIVES = True
### We retrieve the EPC data
# We chunk up this data into 5000 rows at a time
@ -455,48 +437,9 @@ def app():
asset_list.identify_worktypes(cleaned)
from pprint import pprint
pprint(asset_list.work_type_figures)
# TODO: We should do this breakdown for flats
def flat_analysis(asset_list):
    """Build a per-postcode summary of EPC scores for flats.

    The standardised asset list is grouped by postcode and property type.
    Each group whose property type is ``"flat"`` contributes one row with
    the group's row count, how many rows have a current energy efficiency
    score below ``asset_list.FILLED_CAVITY_SAP_THRESHOLD`` and how many are
    below 69.

    Args:
        asset_list: Object exposing ``standardised_asset_list`` (a
            DataFrame), ``STANDARD_POSTCODE``, ``STANDARD_PROPERTY_TYPE``,
            ``EPC_API_DATA_NAMES`` and ``FILLED_CAVITY_SAP_THRESHOLD``.

    Returns:
        A DataFrame with one row per flat group. The full set of columns is
        always present, even when no flats are found.
    """
    output_columns = [
        "Postcode",
        "Property Type",
        "Number of Flats with EPC",
        "Number of Flats below C75",
        "Proportion of Flat EPCs below C75",
        "num_flats_below_c69",
    ]

    # Resolve the column names and thresholds once rather than per group.
    postcode_col = asset_list.STANDARD_POSTCODE
    type_col = asset_list.STANDARD_PROPERTY_TYPE
    score_col = asset_list.EPC_API_DATA_NAMES["current-energy-efficiency"]
    # NOTE(review): labels say "C75"; presumably the threshold is 75 - confirm.
    upper_threshold = asset_list.FILLED_CAVITY_SAP_THRESHOLD
    lower_threshold = 69

    summaries = []
    for _, block in asset_list.standardised_asset_list.groupby(
        [postcode_col, type_col]
    ):
        if "flat" not in block[type_col].values:
            continue

        scores = block[score_col]
        # NOTE(review): the total counts every row in the group, including
        # rows with no score - confirm that matches the "with EPC" label.
        total = len(block)
        below_upper = scores.lt(upper_threshold).sum()
        below_lower = scores.lt(lower_threshold).sum()

        summaries.append(
            {
                "Postcode": block[postcode_col].iloc[0],
                "Property Type": "Flat",
                "Number of Flats with EPC": total,
                "Number of Flats below C75": below_upper,
                "Proportion of Flat EPCs below C75": round(
                    100 * below_upper / total
                ),
                "num_flats_below_c69": below_lower,
            }
        )

    # Explicit columns so an empty result still carries its headers.
    return pd.DataFrame(summaries, columns=output_columns)
flat_data = flat_analysis(asset_list)
asset_list.flat_analysis()
# Store as an excel
filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " - Standardised.xlsx"
@ -504,8 +447,4 @@ def app():
with pd.ExcelWriter(filename) as writer:
asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
matches_review = asset_list[
[FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
]
asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)