mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Merge branch 'main' of https://github.com/Hestia-Homes/Model
This commit is contained in:
commit
eccf4814b9
2 changed files with 12 additions and 14 deletions
|
|
@ -27,6 +27,8 @@ class DataProcessor:
|
|||
"""
|
||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
|
||||
self.confine_data()
|
||||
|
||||
# TODO: Clean number of heated rooms and habitable rooms
|
||||
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
|
||||
self.clean_multi_glaze_proportion()
|
||||
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
|
||||
|
|
@ -39,10 +41,11 @@ class DataProcessor:
|
|||
# Define a custom function to calculate the median, excluding missing values
|
||||
def median_without_missing(group):
|
||||
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
|
||||
|
||||
|
||||
cleaning_averages = self.data.groupby(
|
||||
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
|
||||
observed=True
|
||||
observed=True,
|
||||
dropna=False
|
||||
).apply(median_without_missing).reset_index()
|
||||
|
||||
general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@ def app():
|
|||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
fixed_data = {}
|
||||
|
||||
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
|
||||
if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
||||
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
|
||||
if max(property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
|
||||
continue
|
||||
|
||||
# Map all anomaly values to None
|
||||
|
|
@ -70,7 +70,7 @@ def app():
|
|||
columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS"]
|
||||
|
||||
if any(modified_property_data[columns_to_merge_on].isna()):
|
||||
if modified_property_data[columns_to_merge_on].isna().values.any():
|
||||
# If there are any NA values, back fill first (i.e. most recent), then forward fill if needed
|
||||
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
|
||||
|
||||
|
|
@ -80,12 +80,14 @@ def app():
|
|||
|
||||
# Get the corresponding groupby and merge, and fill in NA values
|
||||
cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
|
||||
|
||||
modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
|
||||
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
|
||||
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
|
||||
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
|
||||
|
||||
for field in AVERAGE_FIXED_FEATURES:
|
||||
|
||||
vals = list(modified_property_data[field].dropna().unique())
|
||||
if len(vals) > 1:
|
||||
# Check whether the values are too far apart
|
||||
|
|
@ -93,11 +95,9 @@ def app():
|
|||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
|
||||
if vals:
|
||||
field_value = np.mean(vals)
|
||||
|
||||
fixed_data[field] = field_value
|
||||
fixed_data[field] = np.mean(vals)
|
||||
|
||||
# Combine all fields together
|
||||
fixed_data.update(mandatory_field_data)
|
||||
|
|
@ -122,11 +122,6 @@ def app():
|
|||
rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
|
||||
heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
|
||||
|
||||
# TODO: Should this be <= 0?
|
||||
if rdsap_change == 0:
|
||||
# Assumption: We aren't interested in records that exhibit no change
|
||||
continue
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue