mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Investigating prospective changes to rdsap data
This commit is contained in:
parent
e05e8ff636
commit
e5f4e96f00
3 changed files with 206 additions and 10 deletions
|
|
@ -33,19 +33,132 @@ class DataProcessor:
|
|||
def insert_data(self, data: pd.DataFrame) -> None:
    """
    Attach an already-loaded dataframe to the processor as self.data
    """
    self.data = data
|
||||
|
||||
def standardise_construction_age_band(self):
    """
    This function will tidy up some of the non-standard values that are populated in the construction age
    band, which is useful for cleaning

    The result is written to a new CONSTRUCTION_AGE_BAND_CLEANED column on self.data; the raw
    CONSTRUCTION_AGE_BAND column is left untouched. Each raw value is handled by the first rule that applies:

    1. Values in Definitions.DATA_ANOMALY_MATCHES, and missing values, are passed through unchanged
    2. Values that are already a standard band label are passed through unchanged
    3. Known non-standard labels are remapped to a standard band label
    4. Values that convert to an integer year between 0 and 3000 are mapped to the band containing that year

    :raises NotImplementedError: if a value is not covered by any of the rules above
    """
    # Inclusive lower ("l") and upper ("u") year bounds for each standard band. The bands do not overlap.
    bounds_map = {
        "England and Wales: before 1900": {"l": 0, "u": 1899},
        "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
        "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
        "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
        "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
        "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
        "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
        "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
        "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
        "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
        "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
        "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
    }

    # Non-standard labels that have a direct standard equivalent
    remap = {
        "England and Wales: 2007 onwards": "England and Wales: 2007-2011",
    }

    def to_year(x):
        # Only swallow the errors int() raises for non-numeric input (OverflowError covers infinities),
        # rather than a bare except which would also hide things like KeyboardInterrupt
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None

    def band_for_year(year):
        # Returns None when the year falls outside every band, so the caller can raise a meaningful error
        for label, bounds in bounds_map.items():
            if bounds["l"] <= year <= bounds["u"]:
                return label
        return None

    def clean_construction_age_band(x):
        # Firstly, we check if it's an error value. pd.isnull catches any NaN, not just the np.nan object
        if x in Definitions.DATA_ANOMALY_MATCHES or pd.isnull(x):
            return x

        # Next, we check if it's a value in our map
        if x in bounds_map:
            return x

        # We check if it's a standard remap value
        if x in remap:
            return remap[x]

        # We check if it's a number, in which case we treat it as a build year
        year = to_year(x)
        if year is not None:
            band = band_for_year(year)
            if band is not None:
                return band

        raise NotImplementedError("Not handled the case for value %s" % (x,))

    self.data["CONSTRUCTION_AGE_BAND_CLEANED"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
        clean_construction_age_band
    )
|
||||
|
||||
def clean_missing_rooms(self):
    """
    For the number of heated rooms and number of habitable rooms, we clean these values up front,
    based on property archetype and age

    Missing values are filled with the median of the populated values for properties that share the same
    PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND_CLEANED and POSTAL_AREA. Whilst values are still
    missing, the last matching column is dropped and the fill is repeated, down to matching on
    PROPERTY_TYPE alone.

    CONSTRUCTION_AGE_BAND_CLEANED must already exist on self.data. A POSTAL_AREA column (the part of
    POSTCODE before the first space) is added, and self.data is replaced by the result of a merge whenever
    a fill is performed.

    TODO: We could use a model based imputation approach for possibly more accurate cleaning

    :raises NotImplementedError: if values are still missing after matching on PROPERTY_TYPE alone
    """

    # Using the .str accessor means a missing POSTCODE gives a missing POSTAL_AREA, rather than raising
    # because a float NaN has no split method
    self.data["POSTAL_AREA"] = self.data["POSTCODE"].str.split(" ").str[0]

    def apply_clean(data, col, matching_columns):
        # The column being cleaned is passed in explicitly rather than read from the enclosing loop
        cleaning_col = f"{col}_CLEANING"

        # Median of the populated values for each combination of the matching columns
        cleaning_data = data[~pd.isnull(data[col])].groupby(matching_columns)[col].median().reset_index()

        data = data.merge(cleaning_data, how="left", on=matching_columns, suffixes=("", "_CLEANING"))

        # Only fill the gaps, populated values are never overwritten
        data[col] = np.where(pd.isnull(data[col]), data[cleaning_col], data[col])
        return data.drop(columns=cleaning_col)

    # Ordered so that dropping columns from the end progressively broadens the match
    matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND_CLEANED", "POSTAL_AREA"]

    for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:

        for n_columns in range(len(matching_columns), 0, -1):
            if not pd.isnull(self.data[col]).any():
                break

            self.data = apply_clean(
                data=self.data,
                col=col,
                matching_columns=matching_columns[:n_columns]
            )

        # If we've matched on PROPERTY_TYPE alone and still have missings - something has gone wrong or
        # we have a very unique property type
        if pd.isnull(self.data[col]).any():
            raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
|
||||
|
||||
def pre_process(self) -> pd.DataFrame:
|
||||
"""
|
||||
Load data and begin initial cleaning
|
||||
"""
|
||||
if not self.data:
|
||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||
|
||||
self.confine_data()
|
||||
|
||||
# TODO: Clean number of heated rooms and habitable rooms
|
||||
# We have some non-standard construction age bands which we'll clean for matching
|
||||
self.standardise_construction_age_band()
|
||||
self.clean_missing_rooms()
|
||||
|
||||
self.recast_df_columns(
|
||||
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
||||
)
|
||||
self.clean_multi_glaze_proportion()
|
||||
self.clean_photo_supply()
|
||||
|
||||
self.retain_multiple_epc_properties(
|
||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
|
||||
)
|
||||
|
|
@ -235,8 +348,7 @@ class DataProcessor:
|
|||
|
||||
for key, values in column_mappings.items():
|
||||
if key not in self.data.columns:
|
||||
print("Column mapping incorrectly specified")
|
||||
exit(1)
|
||||
raise ValueError("Column mapping incorrectly specified")
|
||||
for value in values:
|
||||
self.data[key] = self.data[key].astype(value)
|
||||
|
||||
|
|
@ -272,6 +384,13 @@ class DataProcessor:
|
|||
) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
|
||||
self.data.loc[no_multi_glaze_proportion_index, "MULTI_GLAZE_PROPORTION"] = 100
|
||||
|
||||
def clean_photo_supply(self) -> None:
    """
    Replace missing PHOTO_SUPPLY values with zero, leaving populated values as they are
    """
    photo_supply = self.data["PHOTO_SUPPLY"]
    self.data["PHOTO_SUPPLY"] = photo_supply.fillna(0)
|
||||
|
||||
@staticmethod
|
||||
def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -53,6 +53,12 @@ DEPLOYMENT_FOLDER = "deployment"
|
|||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
||||
# If we have multiple records for a numerical field, such as floor area,
|
||||
# we check the margin for error between the biggest and lowest values. If we see large
|
||||
# swings in measured values, we take the most recent value for this field as we interpret this
|
||||
# as inaccurate measurements in the past and use the most recent value
|
||||
MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1
|
||||
|
||||
COLUMNS_TO_MERGE_ON = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ from simulation_system.core.Settings import (
|
|||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
MULTIPLE_VALUES_MARGIN_FOR_ERROR,
|
||||
)
|
||||
from simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils import save_dataframe_to_s3_parquet
|
||||
|
|
@ -32,10 +33,60 @@ def app():
|
|||
|
||||
dataset = []
|
||||
cleaning_dataset = []
|
||||
# 116
|
||||
# 128048706
|
||||
# PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
|
||||
# -certificates/domestic-E09000021-Kingston-upon-Thames')
|
||||
|
||||
# TODO: Does energy tariff make a difference
|
||||
# TODO: If SAP hasn't changed, we don't include the record
|
||||
# TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
|
||||
# TODO: Same as floor area for floor height
|
||||
# TODO: If fundamental building fabric changes, we should probably discard the record
|
||||
# TODO: Should we prune records that have an exceptionally large amount of time between them?
|
||||
# TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
|
||||
#
|
||||
# TODO: REMOVE ME
|
||||
dodgy_uprns = []
|
||||
observed_uprns = [
|
||||
"10002082244", # Doesn't really make sense, house no longer has lel and not has more insulation but lower score
|
||||
"10002082259",
|
||||
# Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
|
||||
# floor assessment is now assumed whereas before it wasnt
|
||||
"10002082418", # Walls went from insulated to not...
|
||||
"10002082640", # Property identical besides different energy taffiff
|
||||
"10002082830", # Lots of records going from not insulated to insulated but some parts of
|
||||
# the property has gotten better
|
||||
"10002083244", # latest epc indicates the property is worse
|
||||
"10002083592", # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the
|
||||
# floor type has changed from solid to suspended. lel has decreased
|
||||
"100030533576", # property slightly worse, has less lels and the floor description has changed type
|
||||
"100030533668", # has slightly less lels. Glazed type is now missing
|
||||
"100030533803", # Not super clea why this is lower, newer epc has more lel but is using second heating
|
||||
"100030534016", # Property has less lel but more roof insulation. Floor type has changed
|
||||
"100030534040", # property has less lel and the floor type has changed
|
||||
"100030534041", # property has less insulation and less lel
|
||||
"100030534243", # Cavity wall has gone from filled to unfilled
|
||||
"100030534294", # less roof insulation but now has an air source heat pump
|
||||
"100030534322", # identical between records but now with higher lel but no change recorded
|
||||
"100030534413", # identical between records but different energy tariff, no sap change
|
||||
"100030534437", # property has less lel and the mainheating no longer has a programmer and trvs
|
||||
"100030534569", # Cavity wall no longer filled, 30mm more roof insulation in newest epc
|
||||
"100030534676", # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
|
||||
# the wall cavity is no longer filled
|
||||
"100030534732", # property has higher lel %. Not clear why this is worse, glazing type has changed.
|
||||
# This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to
|
||||
# the later epc
|
||||
"100030534791", # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
|
||||
# should discard
|
||||
"100030534795", # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
|
||||
# The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
|
||||
"100030534897", # Roof has gone from thatched with additional insulation to pitched with insulation,
|
||||
# sap score hasn't changed
|
||||
"100030534986", # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and
|
||||
# slightly better main heating setup
|
||||
"100030535043", # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
|
||||
# wall height
|
||||
"100030535173", # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation
|
||||
"100030535244", # lel gone from 100% to 0%, sap is the same
|
||||
]
|
||||
|
||||
for directory in tqdm(directories):
|
||||
|
||||
filepath = directory / "certificates.csv"
|
||||
|
|
@ -74,9 +125,9 @@ def app():
|
|||
|
||||
vals = list(modified_property_data[field].dropna().unique())
|
||||
if len(vals) > 1:
|
||||
# Check the values are too far apart
|
||||
# TODO: we could have multiple values here, why only use the first two?
|
||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
lowest_value = min(vals)
|
||||
largest_value = max(vals)
|
||||
if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
|
|
@ -111,6 +162,26 @@ def app():
|
|||
- starting_record[HEAT_DEMAND_RESPONSE]
|
||||
)
|
||||
|
||||
# Check for a change in the starting and ending record
|
||||
check_cols = [
|
||||
col for col in starting_record.index if col not in [
|
||||
"LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
|
||||
]
|
||||
]
|
||||
all_same = True
|
||||
for col in check_cols:
|
||||
if starting_record[col] != ending_record[col]:
|
||||
all_same = False
|
||||
break
|
||||
|
||||
if rdsap_change <= 0:
|
||||
if all_same | (uprn in observed_uprns):
|
||||
if uprn not in observed_uprns:
|
||||
dodgy_uprns.append(uprn)
|
||||
else:
|
||||
compare = pd.concat([starting_record, ending_record], axis=1)
|
||||
bljd
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue