working on unitas data standardisation

This commit is contained in:
Khalim Conn-Kowlessar 2025-05-07 17:43:14 +01:00
parent 96fb10390b
commit 5848cb5314
6 changed files with 168 additions and 57 deletions

View file

@ -5,6 +5,7 @@ import tiktoken
from pprint import pprint
from datetime import datetime
from numpy.ma.core import masked_not_equal
from openai import OpenAI
import numpy as np
import pandas as pd
@ -2179,7 +2180,7 @@ class AssetList:
return
# TODO: Fetch from Sharepoint
ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/15.04.csv"
ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/07.05.2025.csv"
logger.info("Getting Ecosurv data from %s", ecosurv_filepath)
self.ecosurv = pd.read_csv(ecosurv_filepath, encoding="cp437")
@ -2310,7 +2311,7 @@ class AssetList:
nomatch_i = []
for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)):
if pd.isnull(x[outcomes_address[idx]]):
if pd.isnull(x[outcomes_address[idx]]) or not x[outcomes_address[idx]]:
continue
# Check if we have an id
@ -2448,7 +2449,7 @@ class AssetList:
return pd.to_datetime(match.group(1), format="%d.%m.%Y", errors="coerce")
return pd.NaT
lookup['parsed_date'] = lookup['Date letters sent'].apply(extract_date)
lookup['parsed_date'] = lookup[date_col].apply(extract_date)
def get_latest_note(group):
surveyed = group[group['Outcome'] == 'surveyed']
@ -2457,8 +2458,11 @@ class AssetList:
else:
return group.sort_values('parsed_date', ascending=False).iloc[0]
latest_note = lookup.groupby('domna_property_id', group_keys=False).apply(get_latest_note).reset_index(
drop=True)
latest_note = (
lookup.groupby('domna_property_id', group_keys=False).
apply(get_latest_note).
reset_index(drop=True)
)
latest_note = latest_note[["domna_property_id", notes_col]]
pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index()
@ -2513,36 +2517,43 @@ class AssetList:
# Strip columns
master_data.columns = [c.strip() for c in master_data.columns]
master_data.columns = [re.sub(r'\s+', ' ', c) for c in master_data.columns]
# Drop any unnamed columns
unnamed_columns = [c for c in master_data.columns if "Unnamed:" in c]
master_data = master_data.drop(columns=unnamed_columns)
if not id_map.empty:
master_data = master_data.merge(
id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code']
)
install_col = (
"INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
else "INSTALL / CANCELLATION DATE"
)
if "INSTALLED OR CANCELLED" in master_data.columns:
install_col = "INSTALLED OR CANCELLED"
elif "INSTALL / CANCELLATION DATE" in master_data.columns:
install_col = "INSTALL / CANCELLATION DATE"
elif 'INSTALL/ CANCELLATION DATE' in master_data.columns:
install_col = 'INSTALL/ CANCELLATION DATE'
else:
raise ValueError("No install or cancellation date")
submission_col = (
"SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS"
)
if "UPRN" in master_data.columns:
# We just need to check if any were cancelled
master_to_append = master_data[
["UPRN", install_col, submission_col]
].rename(
columns={
"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
install_col: "survey_status",
submission_col: "submission_date"
}
)
master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
master_surveyed.append(master_to_append)
continue
# if "UPRN" in master_data.columns:
# # We just need to check if any were cancelled
# master_to_append = master_data[
# ["UPRN", install_col, submission_col]
# ].rename(
# columns={
# "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
# install_col: "survey_status",
# submission_col: "submission_date"
# }
# )
# master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
#
# master_surveyed.append(master_to_append)
# continue
master_data["row_id"] = master_data.index
@ -2557,23 +2568,35 @@ class AssetList:
house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO"
# Otherwise, we need to match algorithmically
has_property_id = "UPRN" in master_data.columns
logger.info("Matching master data to asset list")
matched = []
unmatched = []
for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
if pd.isnull(row[postcode_col]):
continue
# if has_property_id:
# submission_uprn = row["UPRN"]
#
# if not pd.isnull(submission_uprn):
# df = self.standardised_asset_list[
# self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == submission_uprn
# ]
postcode_no_space = row[postcode_col].strip().replace(" ", "").lower()
df = self.standardised_asset_list[
(
self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ",
"")
== postcode_no_space
self.standardised_asset_list[self.STANDARD_POSTCODE]
.str.strip().str.lower().str.replace(" ", "") == postcode_no_space
)
]
house_no = row[house_no_col]
if isinstance(house_no, float):
house_no = str(int(house_no))
if house_no in df["house_no"].values:
df = df[df["house_no"] == house_no]

View file

@ -62,32 +62,42 @@ def app():
Property UPRN
"""
# TODO:
# For cavity work:
# - Flag any entries that have a different wall type between non-intrusive data against EPC
# - Worth double checking entries that have a difference in wall construction
# - Look at anything that is flagged as an empty cavity but the EPC data says it's a filled cavity
# - Look at the current EPC scores - Anything that is C75 or above, especially if it's assumed to have no insulation
# - By postcode, we can try and deduce if all of the addresses are flats and then estimate if 50% of the flats
# are less than C75
# - Flag anything pre SAP2012
# - Flag anything over 5 years old
# - Look at year built vs age band
#
# For Solar:
# - Discount any that have solar PV - based on non-intrusives and from the inspections team
# - In the heating, discount anything that isn't ashp, gshp, hhrs, electric storage - possibly homes with
# electric room heaters but it might need to be an EPC E
# - Fabric - check the floor, wall and roof:
# - Filled or empty cavity is good
# - Insulated solid/timber/system built is good
# - SCIS/CEG needs solid floors
# - JJC don't care
# - Anything with a loft 200 or below
# - Anything C75 and above won't qualify
# - Insulated loft = 200mm
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
# - Or the insulation required is loft/cavity (floors should be solid)
# Unitas
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas"
data_filename = "UNITAS - Asset List.xlsx"
sheet_name = "Asset List"
postcode_column = 'Post Code'
fulladdress_column = "Address Line 1"
address1_column = "Address Line 1"
address1_method = None
address_cols_to_concat = []
missing_postcodes_method = None
landlord_year_built = "built year"
landlord_os_uprn = None
landlord_property_type = "Property Type"
landlord_built_form = "Expanded Property Type"
landlord_wall_construction = None
landlord_roof_construction = "loft insulation"
landlord_heating_system = "Bolier Make"
landlord_existing_pv = None
landlord_property_id = "Property Reference"
landlord_sap = "Sap Rating"
outcomes_filename = [
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/Unitas - All outcomes - 24.04.2025.xlsx",
]
outcomes_sheetname = ["Feedback"]
outcomes_postcode = ["Postcode"]
outcomes_houseno = ["No."]
outcomes_id = [None]
outcomes_address = ["Address"]
master_filepaths = [
os.path.join(data_folder, "Submissions ECO 3.csv"),
os.path.join(data_folder, "Submissions ECO 4 - PHASE 1.csv"),
os.path.join(data_folder, "Submissions ECO 4 - PHASE 2.csv")
]
master_to_asset_list_filepath = None
phase = False
ecosurv_landlords = "unitas|everill|baskeyfield"
# LHP:
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP"

View file

@ -212,5 +212,43 @@ BUILT_FORM_MAPPINGS = {
'5 Ext. Wall Flat': 'unknown',
'Unknown': 'unknown',
'Enclosed mid-terrace': 'mid-terrace',
'Enclosed end-terrace': 'end-terrace'
'Enclosed end-terrace': 'end-terrace',
'House GROUND FLOOR': 'ground floor',
'Flat? GROUND FLOOR': 'ground floor',
'House SD SEMI DETACHED': 'semi-detached',
'House SEMI DETACHED': 'semi-detached',
'Flat GROUND FLOOR': 'ground floor',
'': 'unknown',
'Flat SEVENTH FLOOR': 'mid-floor',
'House D': 'detached',
'House ET': 'end-terrace',
'House SD Homeless Unit': 'semi-detached',
'House MT Homeless Unit': 'mid-terrace',
'Bungalow ET': 'end-terrace',
'Bungalow D': 'detached',
'House SD': 'semi-detached',
'Bungalow Sheltered Accomodation': 'unknown',
'House. SD': 'semi-detached',
'Flat FIRST FLOOR MAISONETTE': 'ground floor',
'Bungalow SD': 'semi-detached',
'Flat FIRST FLOOR': 'ground floor',
'Flat Sheltered Accomodation': 'unknown',
'Flat SIXTH FLOOR': 'mid-floor',
'Flat EIGHTH FLOOR': 'mid-floor',
'Flat FOURTH FLOOR': 'mid-floor',
'Flat Homeless Unit': 'unknown',
'Bungalow MT': 'mid-terrace',
'Bungalow Homeless Unit': 'unknown',
'House MT': 'mid-terrace',
'Flat FIFTH FLOOR': 'mid-floor',
'Flat NINTH FLOOR': 'mid-floor',
'House SD FIRST FLOOR': 'semi-detached',
'Bungalow Supported housing': 'unknown',
'Flat THIRD FLOOR': 'mid-floor',
'Flat SECOND FLOOR': 'mid-floor',
'House Homeless Unit': 'unknown',
'Flat ELEVENTH FLOOR': 'mid-floor',
'Flat TENTH FLOOR': 'mid-floor',
'House. MT': 'mid-terrace'
}

View file

@ -263,5 +263,18 @@ HEATING_MAPPINGS = {
'Oil Standard Boiler Heating': 'oil boiler',
'Oil Condensing Boiler Heating': 'oil boiler',
'Electric ASHP': 'air source heat pump',
'Modern Slimline Storage Heaters': 'electric storage heaters'
'Modern Slimline Storage Heaters': 'electric storage heaters',
# These are boiler makes from Unitas
'UNKNOWN': 'unknown',
'IDEAL': 'gas combi boiler',
'VAILLANT': 'gas combi boiler',
'THORN': 'gas combi boiler',
'WORCESTER': 'gas combi boiler',
'GLOW WORM': 'gas combi boiler',
'VOKERA': 'gas combi boiler',
'POTTERTON': 'gas combi boiler',
'BAXI SOLO': 'gas combi boiler',
'BAXI BERMUDA': 'gas combi boiler',
'BAXI': 'gas combi boiler'
}

View file

@ -206,5 +206,18 @@ PROPERTY_MAPPING = {
'02 FLAT': 'flat',
'04 MAISONETTE': 'maisonette',
'01 HOUSE MID': 'house',
'03 BUNGALOW': 'bungalow'
'03 BUNGALOW': 'bungalow',
'Flat?': 'flat',
'Bungalow ET': 'bungalow',
'House. SD': 'house',
'Bungalow SD': 'bungalow',
'Bungalow D': 'bungalow',
'House D': 'house',
'House SD': 'house',
'House ET': 'house',
'Bungalow MT': 'bungalow',
'House MT': 'house',
'House. MT': 'house',
'': 'unknown'
}

View file

@ -6,6 +6,7 @@ STANDARD_ROOF_CONSTRUCTIONS = {
"pitched unknown access to loft",
"piched unknown insulation",
"pitched insulated",
"pitched less than 100mm insulation"
"another dwelling above",
"flat unknown insulation",
"unknown insulated",
@ -23,5 +24,18 @@ ROOF_CONSTRUCTION_MAPPINGS = {
'2018 onwards': 'unknown',
'Pitched (vaulted ceiling)': 'pitched insulated',
np.nan: "unknown",
None: "unknown"
None: "unknown",
'Unknown': 'unknown',
'270mm': 'pitched insulated',
'300mm+': 'pitched insulated',
'100mm': 'pitched less than 100mm insulation',
'250mm': 'pitched insulated',
'300mm': 'pitched insulated',
'No Loft space': 'pitched no access to loft',
'75mm': 'pitched less than 100mm insulation',
'150mm': 'pitched insulated',
'No Loft Hatch': 'pitched unknown access to loft',
'200mm': 'pitched insulated',
'0-49mm': 'pitched less than 100mm insulation',
'50mm': 'pitched less than 100mm insulation',
}