This commit is contained in:
Khalim Conn-Kowlessar 2023-08-15 17:33:52 +01:00
commit eccf4814b9
2 changed files with 12 additions and 14 deletions

View file

@ -27,6 +27,8 @@ class DataProcessor:
"""
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
self.confine_data()
# TODO: Clean number of heated rooms and habitable rooms
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
self.clean_multi_glaze_proportion()
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
@ -39,10 +41,11 @@ class DataProcessor:
# Custom aggregator passed to groupby(...).apply(...) below: for one group of rows,
# select the AVERAGE_FIXED_FEATURES columns and return their median,
# with missing values excluded from the calculation (skipna=True).
def median_without_missing(group):
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
cleaning_averages = self.data.groupby(
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
observed=True
observed=True,
dropna=False
).apply(median_without_missing).reset_index()
general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(

View file

@ -42,8 +42,8 @@ def app():
# Fixed features - these are property attributes that shouldn't change over time
fixed_data = {}
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
continue
# Map all anomaly values to None
@ -70,7 +70,7 @@ def app():
columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS"]
if any(modified_property_data[columns_to_merge_on].isna()):
if modified_property_data[columns_to_merge_on].isna().values.any():
# If there are any NA values, back fill first (i.e. most recent), then forward fill if needed
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
@ -80,12 +80,14 @@ def app():
# Get the corresponding groupby and merge, and fill in NA values
cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
for field in AVERAGE_FIXED_FEATURES:
vals = list(modified_property_data[field].dropna().unique())
if len(vals) > 1:
# Check whether the values are too far apart
@ -93,11 +95,9 @@ def app():
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if vals:
field_value = np.mean(vals)
fixed_data[field] = field_value
fixed_data[field] = np.mean(vals)
# Combine all fields together
fixed_data.update(mandatory_field_data)
@ -122,11 +122,6 @@ def app():
rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
# TODO: Should this be <= 0?
if rdsap_change == 0:
# Assumption: We aren't interested in records that exhibit no change
continue
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
# within descriptions