Investigating prospective changes to rdsap data

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-06 16:47:18 +01:00
parent e05e8ff636
commit e5f4e96f00
3 changed files with 206 additions and 10 deletions

View file

@ -33,19 +33,132 @@ class DataProcessor:
def insert_data(self, data: pd.DataFrame) -> None:
    """
    Attach an already-built dataframe to this processor, replacing whatever ``self.data`` held before
    """
    self.data = data
def standardise_construction_age_band(self):
    """
    Tidy up the non-standard values found in CONSTRUCTION_AGE_BAND and write the result to a new
    CONSTRUCTION_AGE_BAND_CLEANED column on ``self.data``, which is useful for cleaning.

    Each value is handled as follows:
      * missing values (None / NaN) and values in Definitions.DATA_ANOMALY_MATCHES are kept as they are
      * values that are already a standard band label are kept as they are
      * known non-standard labels are remapped to their standard label
      * anything that can be read as an integer year is mapped to the band containing that year

    Raises:
        NotImplementedError: if a value matches none of the cases above
    """
    bounds_map = {
        "England and Wales: before 1900": {"l": 0, "u": 1899},
        "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
        "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
        "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
        "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
        "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
        "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
        "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
        "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
        "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
        "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
        "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
    }
    remap = {
        "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
    }

    def band_for_year(year):
        # The bands do not overlap, so the first match is the only match
        for label, bounds in bounds_map.items():
            if bounds["l"] <= year <= bounds["u"]:
                return label
        return None

    def clean_construction_age_band(x):
        # Firstly, missing values pass straight through. We use pd.isnull rather than a membership
        # test against np.nan because NaN never compares equal to itself, so only the np.nan
        # singleton would be recognised by `in`
        if x is None or pd.isnull(x):
            return x
        # Known error values are left for downstream handling
        if x in Definitions.DATA_ANOMALY_MATCHES:
            return x
        # Next, we check if it's already a standard label
        if x in bounds_map:
            return x
        # We check if it's a standard remap value
        if x in remap:
            return remap[x]
        # We check if it's a number, in which case we look up the band containing that year
        try:
            year = int(x)
        except (TypeError, ValueError, OverflowError):
            year = None
        if year is not None:
            label = band_for_year(year)
            if label is not None:
                return label
        # Anything else (including years outside every band) is a case we haven't handled
        raise NotImplementedError("Not handled the case for value %s" % x)

    self.data["CONSTRUCTION_AGE_BAND_CLEANED"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
        clean_construction_age_band
    )
def clean_missing_rooms(self):
    """
    For the number of heated rooms and number of habitable rooms, we fill missing values up front with
    the median of comparable properties, based on property archetype, age and postal area.

    Properties are first compared on all of PROPERTY_TYPE, BUILT_FORM, CONSTRUCTION_AGE_BAND_CLEANED
    and POSTAL_AREA. While missing values remain, the last matching column is dropped and the fill is
    repeated, down to matching on PROPERTY_TYPE alone.

    Side effects: adds a POSTAL_AREA column to ``self.data`` and replaces ``self.data`` with the
    merged dataframe (the left merge does not keep the original index).

    Raises:
        NotImplementedError: if values are still missing after matching on PROPERTY_TYPE alone

    TODO: We could use a model based imputation approach for possibly more accurate cleaning
    """
    # Missing postcodes are left as missing rather than crashing on a non-string value
    self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(
        lambda x: x.split(" ")[0] if isinstance(x, str) else x
    )

    matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND_CLEANED", "POSTAL_AREA"]

    def apply_clean(data, col, group_columns):
        # Median of the known values for each group of comparable properties
        medians = data[~pd.isnull(data[col])].groupby(group_columns)[col].median().reset_index()
        data = data.merge(
            medians, how="left", on=group_columns, suffixes=("", "_CLEANING")
        )
        # Only fill the gaps; populated values are left untouched
        data[col] = np.where(pd.isnull(data[col]), data[f"{col}_CLEANING"], data[col])
        return data.drop(columns=f"{col}_CLEANING")

    for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
        # Start with the most specific match and progressively relax it
        for n_columns in range(len(matching_columns), 0, -1):
            if not pd.isnull(self.data[col]).any():
                break
            self.data = apply_clean(
                data=self.data,
                col=col,
                group_columns=matching_columns[:n_columns]
            )
        else:
            # We've relaxed the match all the way and may still have missings - something has gone
            # wrong or we have a very unique property type
            if pd.isnull(self.data[col]).any():
                raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
def pre_process(self) -> pd.DataFrame:
"""
Load data and begin initial cleaning
"""
if not self.data:
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
self.confine_data()
# TODO: Clean number of heated rooms and habitable rooms
# We have some non-standard construction age bands which we'll clean for matching
self.standardise_construction_age_band()
self.clean_missing_rooms()
self.recast_df_columns(
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
)
self.clean_multi_glaze_proportion()
self.clean_photo_supply()
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
)
@ -235,8 +348,7 @@ class DataProcessor:
for key, values in column_mappings.items():
if key not in self.data.columns:
print("Column mapping incorrectly specified")
exit(1)
raise ValueError("Column mapping incorrectly specified")
for value in values:
self.data[key] = self.data[key].astype(value)
@ -272,6 +384,13 @@ class DataProcessor:
) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
self.data.loc[no_multi_glaze_proportion_index, "MULTI_GLAZE_PROPORTION"] = 100
def clean_photo_supply(self) -> None:
    """
    Replace missing PHOTO_SUPPLY values with zero, leaving populated values as they are
    """
    photo_supply = self.data["PHOTO_SUPPLY"]
    self.data["PHOTO_SUPPLY"] = photo_supply.fillna(0)
@staticmethod
def apply_averages_cleaning(data_to_clean, cleaning_data, cols_to_merge_on):
"""

View file

@ -53,6 +53,12 @@ DEPLOYMENT_FOLDER = "deployment"
# NOTE(review): presumably square metres and metres respectively - confirm units
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
# If we have multiple records for a numerical field, such as floor area, we compare the
# relative gap between the largest and smallest values against this margin for error. A gap
# above the margin is interpreted as inaccurate measurement in the past, so we take the most
# recent value for that field.
MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1
COLUMNS_TO_MERGE_ON = [
"PROPERTY_TYPE",
"BUILT_FORM",

View file

@ -11,6 +11,7 @@ from simulation_system.core.Settings import (
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
MULTIPLE_VALUES_MARGIN_FOR_ERROR,
)
from simulation_system.core.DataProcessor import DataProcessor
from utils import save_dataframe_to_s3_parquet
@ -32,10 +33,60 @@ def app():
dataset = []
cleaning_dataset = []
# 116
# 128048706
# PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic
# -certificates/domestic-E09000021-Kingston-upon-Thames')
# TODO: Does energy tariff make a difference
# TODO: If SAP hasn't changed, we don't include the record
# TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
# TODO: Same as floor area for floor height
# TODO: If fundamental building fabric changes, we should probably discard the record
# TODO: Should we prune records that have an exceptionally large amount of time between them?
# TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
#
# TODO: REMOVE ME
dodgy_uprns = []
observed_uprns = [
"10002082244", # Doesn't really make sense, house no longer has lel and not has more insulation but lower score
"10002082259",
# Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
# floor assessment is now assumed whereas before it wasnt
"10002082418", # Walls went from insulated to not...
"10002082640", # Property identical besides different energy taffiff
"10002082830", # Lots of records going from not insulated to insulated but some parts of
# the property has gotten better
"10002083244", # latest epc indicates the property is worse
"10002083592", # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the
# floor type has changed from solid to suspended. lel has decreased
"100030533576", # property slightly worse, has less lels and the floor description has changed type
"100030533668", # has slightly less lels. Glazed type is now missing
"100030533803", # Not super clea why this is lower, newer epc has more lel but is using second heating
"100030534016", # Property has less lel but more roof insulation. Floor type has changed
"100030534040", # property has less lel and the floor type has changed
"100030534041", # property has less insulation and less lel
"100030534243", # Cavity wall has gone from filled to unfilled
"100030534294", # less roof insulation but now has an air source heat pump
"100030534322", # identical between records but now with higher lel but no change recorded
"100030534413", # identical between records but different energy tariff, no sap change
"100030534437", # property has less lel and the mainheating no longer has a programmer and trvs
"100030534569", # Cavity wall no longer filled, 30mm more roof insulation in newest epc
"100030534676", # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
# the wall cavity is no longer filled
"100030534732", # property has higher lel %. Not clear why this is worse, glazing type has changed.
# This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to
# the later epc
"100030534791", # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
# should discard
"100030534795", # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
# The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
"100030534897", # Roof has gone from thatched with additional insulation to pitched with insulation,
# sap score hasn't changed
"100030534986", # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and
# slightly better main heating setup
"100030535043", # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
# wall height
"100030535173", # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation
"100030535244", # lel gone from 100% to 0%, sap is the same
]
for directory in tqdm(directories):
filepath = directory / "certificates.csv"
@ -74,9 +125,9 @@ def app():
vals = list(modified_property_data[field].dropna().unique())
if len(vals) > 1:
# Check the values are too far apart
# TODO: we could have multiple values here, why only use the first two?
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
lowest_value = min(vals)
largest_value = max(vals)
if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR:
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
@ -111,6 +162,26 @@ def app():
- starting_record[HEAT_DEMAND_RESPONSE]
)
# Check for a change in the starting and ending record
check_cols = [
col for col in starting_record.index if col not in [
"LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
]
]
all_same = True
for col in check_cols:
if starting_record[col] != ending_record[col]:
all_same = False
break
if rdsap_change <= 0:
if all_same | (uprn in observed_uprns):
if uprn not in observed_uprns:
dodgy_uprns.append(uprn)
else:
compare = pd.concat([starting_record, ending_record], axis=1)
bljd
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions