mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
creating the asset list class
This commit is contained in:
parent
55d2df1787
commit
8432b7d202
2 changed files with 180 additions and 50 deletions
64
asset_list/AssetList.py
Normal file
64
asset_list/AssetList.py
Normal file
|
|
@ -0,0 +1,64 @@
|
|||
import os
|
||||
import pandas as pd
|
||||
|
||||
|
||||
class AssetList:
    """
    This class is used to standardise asset lists so that we can process the core information in a
    consistent manner.

    The raw spreadsheet is loaded on construction; the names of the columns of interest are stored
    on the instance so that the cleaning steps can refer to them.
    """

    # These are the accepted methods we have for cleaning the address1 column
    ADDRESS_1_CLEANING_METHODS = [
        "first_two_words",  # This method will split on the first two words, where the separator is a space
        "first_word",  # This method will split on the first word, where the separator is a space
        "house_number_extraction",  # This method will use the NLP model in SearchEPC to extract the housenumber
        "address1_extraction",  # This method will use the NLP model to extract address1
    ]

    def __init__(
        self,
        local_filepath,
        sheet_name,
        address1_colname=None,
        postcode_colname=None,
        full_address_colname=None,
        full_address_cols_to_concat=None,
        missing_postcodes_method=None,
        landlord_year_built=None,
        landlord_uprn=None,
        header=0,
    ):
        """
        Read the asset list from an Excel workbook and record the column configuration.

        The three column-name arguments default to None so that the class can be constructed with
        just a file path and a sheet name.

        :param local_filepath: path of the Excel workbook, passed to pd.read_excel
        :param sheet_name: sheet to read, passed to pd.read_excel
        :param address1_colname: name of the address1 column, or None
        :param postcode_colname: name of the postcode column, or None
        :param full_address_colname: name of the full address column, or None
        :param full_address_cols_to_concat: stored as a cleaning parameter (not used by the class yet)
        :param missing_postcodes_method: stored as a cleaning parameter (not used by the class yet)
        :param landlord_year_built: stored alongside the column names; presumably the name of the
            landlord's year built column - confirm against the caller
        :param landlord_uprn: stored alongside the column names; presumably the name of the
            landlord's UPRN column - confirm against the caller
        :param header: header row index, passed to pd.read_excel
        """
        self.local_filepath = local_filepath
        self.sheet_name = sheet_name
        self.standardised_asset_list = None

        # Read in the data
        self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)

        # We detect the presence of the non-intrusive columns
        self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns

        # Names of columns
        self.address1_colname = address1_colname
        self.postcode_colname = postcode_colname
        self.full_address_colname = full_address_colname
        self.landlord_year_built = landlord_year_built
        self.landlord_uprn = landlord_uprn

        # parameters for cleaning
        self.full_address_cols_to_concat = full_address_cols_to_concat
        self.missing_postcodes_method = missing_postcodes_method

    def standardise(self):
        """
        This function is used to standardise the asset list.

        The standardisation itself is not written yet: the method currently stores a frame with no
        columns selected in self.standardised_asset_list and then raises.

        :return: standardised asset list (once implemented)
        :raises NotImplementedError: always, until the standardisation steps are added
        """

        # We keep just the columns we care about and will work through the various columns and
        # standardise. The list of columns to keep has not been decided yet, so it is empty.
        columns_to_keep = []
        self.standardised_asset_list = self.raw_asset_list[columns_to_keep]

        raise NotImplementedError("AssetList.standardise is not implemented yet")
|
||||
|
|
@ -5,6 +5,7 @@ import pandas as pd
|
|||
import numpy as np
|
||||
from tqdm import tqdm
|
||||
from datetime import datetime
|
||||
from asset_list.AssetList import AssetList
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
|
|
@ -172,60 +173,107 @@ def extract_address1(asset_list, full_address_col, postcode_col, method="first_t
|
|||
raise ValueError(f"Method {method} not recognized")
|
||||
|
||||
|
||||
def process_age_band(asset_list, year_built_column):
    """
    Compare the landlord's year built with the EPC "Property Age Band" for every row.

    Each row is classified into one of the following cases, in order:
      1. missing or anomalous age band -> "No EPC Age Band", no bounds
      2. "England and Wales: 2007 onwards" / "England and Wales: 2012 onwards" -> lower bound only
      3. "England and Wales: before 1900" -> upper bound of 1899 only
      4. an age band made up of digits only -> treated as an exact year
      5. anything else is parsed as '<prefix>: {lower date}-{upper date}'

    When the age band is usable but the year built is missing, the row is labelled
    "No Year Built From Landlord" (the EPC bounds are still returned).

    :param asset_list: DataFrame containing "row_id", "Property Age Band" and the year built column
    :param year_built_column: name of the column holding the landlord's year built
    :return: DataFrame with the columns "row_id", "epc_year_lower_bound", "epc_year_upper_bound"
        and "Does Age Match EPC Age Band?", one row per input row. The columns are present even
        when asset_list is empty, so the result can always be merged on "row_id".
    """
    output_columns = [
        "row_id",
        "epc_year_lower_bound",
        "epc_year_upper_bound",
        "Does Age Match EPC Age Band?",
    ]
    no_year_built = "No Year Built From Landlord"
    matches = "EPC Age Band Matches Year Built"
    epc_older = "EPC Age Band is older than Year Built"
    epc_newer = "EPC Age Band is newer than Year Built"

    processed_age_band = []
    for _, x in asset_list.iterrows():
        age_band = x["Property Age Band"]

        if pd.isnull(age_band) or (age_band in Definitions.DATA_ANOMALY_MATCHES):
            processed_age_band.append({
                "row_id": x["row_id"],
                "epc_year_lower_bound": None,
                "epc_year_upper_bound": None,
                "Does Age Match EPC Age Band?": "No EPC Age Band"
            })
            continue

        year_built = x[year_built_column]
        # Checked before any comparison: comparing None raises and comparing NaN is always False
        year_built_known = not pd.isnull(year_built)
        age_band_matches = no_year_built

        if age_band in ["England and Wales: 2007 onwards", "England and Wales: 2012 onwards"]:
            # Open-ended band: only a lower bound exists
            year_lower_bound = 2007 if age_band == "England and Wales: 2007 onwards" else 2012
            year_upper_bound = None
            if year_built_known:
                age_band_matches = matches if year_built >= year_lower_bound else epc_older

        elif age_band == "England and Wales: before 1900":
            # Open-ended band: only an upper bound exists
            year_lower_bound = None
            year_upper_bound = 1899
            if year_built_known:
                age_band_matches = matches if year_built < 1900 else epc_newer

        elif age_band.isdigit():
            # The age band is a single year, so both bounds are that year
            year_lower_bound = int(age_band)
            year_upper_bound = int(age_band)
            if year_built_known:
                age_band_matches = (
                    matches if year_built == year_lower_bound
                    else "EPC Age Band is different from Year Built"
                )

        else:
            # Otherwise, the age band will be formatted as such:
            # 'England and Wales: {lower date}-{upper date}'
            # so we extract the lower and upper date
            lower_date, upper_date = age_band.split(": ")[1].split("-")
            year_lower_bound = int(lower_date)
            year_upper_bound = int(upper_date)
            if year_built_known:
                if float(lower_date) <= year_built <= float(upper_date):
                    age_band_matches = matches
                elif year_built > float(upper_date):
                    age_band_matches = epc_older
                else:
                    age_band_matches = epc_newer

        processed_age_band.append({
            "row_id": x["row_id"],
            "epc_year_lower_bound": year_lower_bound,
            "epc_year_upper_bound": year_upper_bound,
            "Does Age Match EPC Age Band?": age_band_matches
        })

    # Explicit columns keep the schema stable when there are no rows at all
    return pd.DataFrame(processed_age_band, columns=output_columns)
|
||||
|
||||
|
||||
def app():
|
||||
|
|
@ -282,16 +330,27 @@ def app():
|
|||
DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
|
||||
DATA_FILENAME = "WESTWARD - completed list..xlsx"
|
||||
SHEET_NAME = "Sheet1"
|
||||
|
||||
POSTCODE_COLUMN = "WFT EDIT Postcode"
|
||||
FULLADDRESS_COLUMN = "Address"
|
||||
ADDRESS1_COLUMN = None
|
||||
ADDRESS1_METHOD = "house_number_extraction"
|
||||
|
||||
ADDRESS_COLS_TO_CONCAT = []
|
||||
MISSING_POSTCODES_METHOD = None
|
||||
PROPERTY_YEAR_BUILT = "Build date"
|
||||
UPRN_COLUMN = "UPRN"
|
||||
# If we have the non-intrusives data, this should be true
|
||||
HAS_NON_INTRUSIVES = True
|
||||
PROPERTY_TYPE_COLUMN = "Location type" # This will be used to identify and remove bedsits
|
||||
|
||||
invalid_property_types_dictionary = ["bedsit", "bed-sit", "bed sit"]
|
||||
|
||||
asset_list = AssetList(
|
||||
local_filepath=os.path.join(DATA_FOLDER, DATA_FILENAME),
|
||||
header=0,
|
||||
sheet_name=SHEET_NAME
|
||||
)
|
||||
|
||||
# DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
|
||||
# DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
|
||||
|
|
@ -608,8 +667,10 @@ def app():
|
|||
# 3) If we have year in the asset list, we flag entries where the built year is different from the
|
||||
# EPC Age band
|
||||
if PROPERTY_YEAR_BUILT is not None:
|
||||
asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
|
||||
lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
|
||||
# We process the age band and merge it on
|
||||
processed_age_band = process_age_band(asset_list, PROPERTY_YEAR_BUILT)
|
||||
asset_list = asset_list.merge(
|
||||
processed_age_band, how="left", on="row_id"
|
||||
)
|
||||
|
||||
if HAS_NON_INTRUSIVES:
|
||||
|
|
@ -621,7 +682,12 @@ def app():
|
|||
(asset_list["Construction"] == "CAVITY") &
|
||||
asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
|
||||
(
|
||||
(asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
|
||||
# Should we defer to the year built provided by the HA?
|
||||
(asset_list[PROPERTY_YEAR_BUILT] <= 1995) | (asset_list["epc_year_upper_bound"] <= 1995)
|
||||
) &
|
||||
(
|
||||
# We check if the property type column contains one of the invalid property types
|
||||
~asset_list[PROPERTY_TYPE_COLUMN].str.lower().str.contains("|".join(invalid_property_types_dictionary))
|
||||
)
|
||||
)
|
||||
|
||||
|
|
@ -633,9 +699,9 @@ def app():
|
|||
(asset_list[PROPERTY_YEAR_BUILT] <= 1995)
|
||||
) &
|
||||
(
|
||||
asset_list[]
|
||||
asset_list[PROPERTY_TYPE_COLUMN]
|
||||
)
|
||||
]
|
||||
]
|
||||
|
||||
# 4) Flag properties that look like they're good candidates for solar installs
|
||||
# Firstly, flag if the fabric is completely done
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue