mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
working on unitas data standardisation
This commit is contained in:
parent
96fb10390b
commit
5848cb5314
6 changed files with 168 additions and 57 deletions
|
|
@ -5,6 +5,7 @@ import tiktoken
|
|||
from pprint import pprint
|
||||
from datetime import datetime
|
||||
|
||||
from numpy.ma.core import masked_not_equal
|
||||
from openai import OpenAI
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
|
|
@ -2179,7 +2180,7 @@ class AssetList:
|
|||
return
|
||||
|
||||
# TODO: Fetch from Sharepoint
|
||||
ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/15.04.csv"
|
||||
ecosurv_filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Ecosurv/07.05.2025.csv"
|
||||
logger.info("Getting Ecosurv data from %s", ecosurv_filepath)
|
||||
self.ecosurv = pd.read_csv(ecosurv_filepath, encoding="cp437")
|
||||
|
||||
|
|
@ -2310,7 +2311,7 @@ class AssetList:
|
|||
nomatch_i = []
|
||||
for _, x in tqdm(outcomes.iterrows(), total=len(outcomes)):
|
||||
|
||||
if pd.isnull(x[outcomes_address[idx]]):
|
||||
if pd.isnull(x[outcomes_address[idx]]) or not x[outcomes_address[idx]]:
|
||||
continue
|
||||
|
||||
# Check if we have an id
|
||||
|
|
@ -2448,7 +2449,7 @@ class AssetList:
|
|||
return pd.to_datetime(match.group(1), format="%d.%m.%Y", errors="coerce")
|
||||
return pd.NaT
|
||||
|
||||
lookup['parsed_date'] = lookup['Date letters sent'].apply(extract_date)
|
||||
lookup['parsed_date'] = lookup[date_col].apply(extract_date)
|
||||
|
||||
def get_latest_note(group):
|
||||
surveyed = group[group['Outcome'] == 'surveyed']
|
||||
|
|
@ -2457,8 +2458,11 @@ class AssetList:
|
|||
else:
|
||||
return group.sort_values('parsed_date', ascending=False).iloc[0]
|
||||
|
||||
latest_note = lookup.groupby('domna_property_id', group_keys=False).apply(get_latest_note).reset_index(
|
||||
drop=True)
|
||||
latest_note = (
|
||||
lookup.groupby('domna_property_id', group_keys=False).
|
||||
apply(get_latest_note).
|
||||
reset_index(drop=True)
|
||||
)
|
||||
latest_note = latest_note[["domna_property_id", notes_col]]
|
||||
|
||||
pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index()
|
||||
|
|
@ -2513,36 +2517,43 @@ class AssetList:
|
|||
# Strip columns
|
||||
master_data.columns = [c.strip() for c in master_data.columns]
|
||||
master_data.columns = [re.sub(r'\s+', ' ', c) for c in master_data.columns]
|
||||
# Drop any unnamed columns
|
||||
unnamed_columns = [c for c in master_data.columns if "Unnamed:" in c]
|
||||
master_data = master_data.drop(columns=unnamed_columns)
|
||||
|
||||
if not id_map.empty:
|
||||
master_data = master_data.merge(
|
||||
id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code']
|
||||
)
|
||||
|
||||
install_col = (
|
||||
"INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
|
||||
else "INSTALL / CANCELLATION DATE"
|
||||
)
|
||||
if "INSTALLED OR CANCELLED" in master_data.columns:
|
||||
install_col = "INSTALLED OR CANCELLED"
|
||||
elif "INSTALL / CANCELLATION DATE" in master_data.columns:
|
||||
install_col = "INSTALL / CANCELLATION DATE"
|
||||
elif 'INSTALL/ CANCELLATION DATE' in master_data.columns:
|
||||
install_col = 'INSTALL/ CANCELLATION DATE'
|
||||
else:
|
||||
raise ValueError("No install or cancellation date")
|
||||
|
||||
submission_col = (
|
||||
"SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS"
|
||||
)
|
||||
|
||||
if "UPRN" in master_data.columns:
|
||||
# We just need to check if any were cancelled
|
||||
master_to_append = master_data[
|
||||
["UPRN", install_col, submission_col]
|
||||
].rename(
|
||||
columns={
|
||||
"UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
|
||||
install_col: "survey_status",
|
||||
submission_col: "submission_date"
|
||||
}
|
||||
)
|
||||
master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
|
||||
|
||||
master_surveyed.append(master_to_append)
|
||||
continue
|
||||
# if "UPRN" in master_data.columns:
|
||||
# # We just need to check if any were cancelled
|
||||
# master_to_append = master_data[
|
||||
# ["UPRN", install_col, submission_col]
|
||||
# ].rename(
|
||||
# columns={
|
||||
# "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
|
||||
# install_col: "survey_status",
|
||||
# submission_col: "submission_date"
|
||||
# }
|
||||
# )
|
||||
# master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
|
||||
#
|
||||
# master_surveyed.append(master_to_append)
|
||||
# continue
|
||||
|
||||
master_data["row_id"] = master_data.index
|
||||
|
||||
|
|
@ -2557,23 +2568,35 @@ class AssetList:
|
|||
house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO"
|
||||
|
||||
# Otherwise, we need to match algorithmically
|
||||
has_property_id = "UPRN" in master_data.columns
|
||||
logger.info("Matching master data to asset list")
|
||||
matched = []
|
||||
unmatched = []
|
||||
for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
|
||||
|
||||
if pd.isnull(row[postcode_col]):
|
||||
continue
|
||||
|
||||
# if has_property_id:
|
||||
# submission_uprn = row["UPRN"]
|
||||
#
|
||||
# if not pd.isnull(submission_uprn):
|
||||
# df = self.standardised_asset_list[
|
||||
# self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] == submission_uprn
|
||||
# ]
|
||||
|
||||
postcode_no_space = row[postcode_col].strip().replace(" ", "").lower()
|
||||
|
||||
df = self.standardised_asset_list[
|
||||
(
|
||||
self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ",
|
||||
"")
|
||||
== postcode_no_space
|
||||
self.standardised_asset_list[self.STANDARD_POSTCODE]
|
||||
.str.strip().str.lower().str.replace(" ", "") == postcode_no_space
|
||||
)
|
||||
]
|
||||
|
||||
house_no = row[house_no_col]
|
||||
if isinstance(house_no, float):
|
||||
house_no = str(int(house_no))
|
||||
|
||||
if house_no in df["house_no"].values:
|
||||
df = df[df["house_no"] == house_no]
|
||||
|
|
|
|||
|
|
@ -62,32 +62,42 @@ def app():
|
|||
Property UPRN
|
||||
"""
|
||||
|
||||
# TODO:
|
||||
# For cavity work:
|
||||
# - Flag any entries where the wall type in the non-intrusive data differs from the EPC
|
||||
# - Worth double checking entries that have a difference in wall construction
|
||||
# - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
|
||||
# - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
|
||||
# - By postcode, we can try and deduce if all of the addresses are flats and then estimate if 50% of the flats
|
||||
# are less than C75
|
||||
# - Flag anything pre SAP2012
|
||||
# - Flag anything over 5 years old
|
||||
# - Look at year built vs age band
|
||||
#
|
||||
# For Solar:
|
||||
# - Discount any that have solar PV - based on non-intrusives and from the inspections team
|
||||
# - In the heating, discount anything that isn’t ashp, gshp, hhrs, electric storage - possibly homes with
|
||||
# electric room heaters but it might need to be an EPC E
|
||||
# - Fabric - check the floor, wall and roof:
|
||||
# - Filled or empty cavity is good
|
||||
# - Insulated solid/timber/system built is good
|
||||
# - SCIS/CEG needs solid floors
|
||||
# - JJC don’t care
|
||||
# - Anything with a loft 200 or below
|
||||
# - Anything C75 and above won’t qualify
|
||||
# - Insulated loft = 200mm
|
||||
# - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
|
||||
# - Or the insulation required is loft/cavity (floors should be solid)
|
||||
# Unitas
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas"
|
||||
data_filename = "UNITAS - Asset List.xlsx"
|
||||
sheet_name = "Asset List"
|
||||
postcode_column = 'Post Code'
|
||||
fulladdress_column = "Address Line 1"
|
||||
address1_column = "Address Line 1"
|
||||
address1_method = None
|
||||
address_cols_to_concat = []
|
||||
missing_postcodes_method = None
|
||||
landlord_year_built = "built year"
|
||||
landlord_os_uprn = None
|
||||
landlord_property_type = "Property Type"
|
||||
landlord_built_form = "Expanded Property Type"
|
||||
landlord_wall_construction = None
|
||||
landlord_roof_construction = "loft insulation"
|
||||
landlord_heating_system = "Bolier Make"
|
||||
landlord_existing_pv = None
|
||||
landlord_property_id = "Property Reference"
|
||||
landlord_sap = "Sap Rating"
|
||||
outcomes_filename = [
|
||||
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unitas/Unitas - All outcomes - 24.04.2025.xlsx",
|
||||
]
|
||||
outcomes_sheetname = ["Feedback"]
|
||||
outcomes_postcode = ["Postcode"]
|
||||
outcomes_houseno = ["No."]
|
||||
outcomes_id = [None]
|
||||
outcomes_address = ["Address"]
|
||||
master_filepaths = [
|
||||
os.path.join(data_folder, "Submissions ECO 3.csv"),
|
||||
os.path.join(data_folder, "Submissions ECO 4 - PHASE 1.csv"),
|
||||
os.path.join(data_folder, "Submissions ECO 4 - PHASE 2.csv")
|
||||
]
|
||||
master_to_asset_list_filepath = None
|
||||
phase = False
|
||||
ecosurv_landlords = "unitas|everill|baskeyfield"
|
||||
|
||||
# LHP:
|
||||
data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/LHP"
|
||||
|
|
|
|||
|
|
@ -212,5 +212,43 @@ BUILT_FORM_MAPPINGS = {
|
|||
'5 Ext. Wall Flat': 'unknown',
|
||||
'Unknown': 'unknown',
|
||||
'Enclosed mid-terrace': 'mid-terrace',
|
||||
'Enclosed end-terrace': 'end-terrace'
|
||||
'Enclosed end-terrace': 'end-terrace',
|
||||
|
||||
'House GROUND FLOOR': 'ground floor',
|
||||
'Flat? GROUND FLOOR': 'ground floor',
|
||||
'House SD SEMI DETACHED': 'semi-detached',
|
||||
'House SEMI DETACHED': 'semi-detached',
|
||||
'Flat GROUND FLOOR': 'ground floor',
|
||||
'': 'unknown',
|
||||
'Flat SEVENTH FLOOR': 'mid-floor',
|
||||
'House D': 'detached',
|
||||
'House ET': 'end-terrace',
|
||||
'House SD Homeless Unit': 'semi-detached',
|
||||
'House MT Homeless Unit': 'mid-terrace',
|
||||
'Bungalow ET': 'end-terrace',
|
||||
'Bungalow D': 'detached',
|
||||
'House SD': 'semi-detached',
|
||||
'Bungalow Sheltered Accomodation': 'unknown',
|
||||
'House. SD': 'semi-detached',
|
||||
'Flat FIRST FLOOR MAISONETTE': 'ground floor',
|
||||
'Bungalow SD': 'semi-detached',
|
||||
'Flat FIRST FLOOR': 'ground floor',
|
||||
'Flat Sheltered Accomodation': 'unknown',
|
||||
'Flat SIXTH FLOOR': 'mid-floor',
|
||||
'Flat EIGHTH FLOOR': 'mid-floor',
|
||||
'Flat FOURTH FLOOR': 'mid-floor',
|
||||
'Flat Homeless Unit': 'unknown',
|
||||
'Bungalow MT': 'mid-terrace',
|
||||
'Bungalow Homeless Unit': 'unknown',
|
||||
'House MT': 'mid-terrace',
|
||||
'Flat FIFTH FLOOR': 'mid-floor',
|
||||
'Flat NINTH FLOOR': 'mid-floor',
|
||||
'House SD FIRST FLOOR': 'semi-detached',
|
||||
'Bungalow Supported housing': 'unknown',
|
||||
'Flat THIRD FLOOR': 'mid-floor',
|
||||
'Flat SECOND FLOOR': 'mid-floor',
|
||||
'House Homeless Unit': 'unknown',
|
||||
'Flat ELEVENTH FLOOR': 'mid-floor',
|
||||
'Flat TENTH FLOOR': 'mid-floor',
|
||||
'House. MT': 'mid-terrace'
|
||||
}
|
||||
|
|
|
|||
|
|
@ -263,5 +263,18 @@ HEATING_MAPPINGS = {
|
|||
'Oil Standard Boiler Heating': 'oil boiler',
|
||||
'Oil Condensing Boiler Heating': 'oil boiler',
|
||||
'Electric ASHP': 'air source heat pump',
|
||||
'Modern Slimline Storage Heaters': 'electric storage heaters'
|
||||
'Modern Slimline Storage Heaters': 'electric storage heaters',
|
||||
# These are boiler makes from Unitas
|
||||
'UNKNOWN': 'unknown',
|
||||
'IDEAL': 'gas combi boiler',
|
||||
'VAILLANT': 'gas combi boiler',
|
||||
'THORN': 'gas combi boiler',
|
||||
'WORCESTER': 'gas combi boiler',
|
||||
'GLOW WORM': 'gas combi boiler',
|
||||
'VOKERA': 'gas combi boiler',
|
||||
'POTTERTON': 'gas combi boiler',
|
||||
'BAXI SOLO': 'gas combi boiler',
|
||||
'BAXI BERMUDA': 'gas combi boiler',
|
||||
'BAXI': 'gas combi boiler'
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -206,5 +206,18 @@ PROPERTY_MAPPING = {
|
|||
'02 FLAT': 'flat',
|
||||
'04 MAISONETTE': 'maisonette',
|
||||
'01 HOUSE MID': 'house',
|
||||
'03 BUNGALOW': 'bungalow'
|
||||
'03 BUNGALOW': 'bungalow',
|
||||
'Flat?': 'flat',
|
||||
'Bungalow ET': 'bungalow',
|
||||
'House. SD': 'house',
|
||||
'Bungalow SD': 'bungalow',
|
||||
'Bungalow D': 'bungalow',
|
||||
'House D': 'house',
|
||||
'House SD': 'house',
|
||||
'House ET': 'house',
|
||||
'Bungalow MT': 'bungalow',
|
||||
'House MT': 'house',
|
||||
'House. MT': 'house',
|
||||
'': 'unknown'
|
||||
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@ STANDARD_ROOF_CONSTRUCTIONS = {
|
|||
"pitched unknown access to loft",
|
||||
"piched unknown insulation",
|
||||
"pitched insulated",
|
||||
"pitched less than 100mm insulation"
|
||||
"another dwelling above",
|
||||
"flat unknown insulation",
|
||||
"unknown insulated",
|
||||
|
|
@ -23,5 +24,18 @@ ROOF_CONSTRUCTION_MAPPINGS = {
|
|||
'2018 onwards': 'unknown',
|
||||
'Pitched (vaulted ceiling)': 'pitched insulated',
|
||||
np.nan: "unknown",
|
||||
None: "unknown"
|
||||
None: "unknown",
|
||||
'Unknown': 'unknown',
|
||||
'270mm': 'pitched insulated',
|
||||
'300mm+': 'pitched insulated',
|
||||
'100mm': 'pitched less than 100mm insulation',
|
||||
'250mm': 'pitched insulated',
|
||||
'300mm': 'pitched insulated',
|
||||
'No Loft space': 'pitched no access to loft',
|
||||
'75mm': 'pitched less than 100mm insulation',
|
||||
'150mm': 'pitched insulated',
|
||||
'No Loft Hatch': 'pitched unknown access to loft',
|
||||
'200mm': 'pitched insulated',
|
||||
'0-49mm': 'pitched less than 100mm insulation',
|
||||
'50mm': 'pitched less than 100mm insulation',
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue