mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
make sure all responses are positive, began adding temporal features
This commit is contained in:
parent
e5f4e96f00
commit
e516a6ac41
2 changed files with 51 additions and 116 deletions
|
|
@ -53,12 +53,6 @@ DEPLOYMENT_FOLDER = "deployment"
|
|||
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
|
||||
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
|
||||
|
||||
# If we have multiple records for a numerical field, such as floor area,
|
||||
# we check the margin for error between the biggest and lowest values. If we see large
|
||||
# swings in measured values, we take the most recent value for this field as we interpret this
|
||||
# as inaccurate measurements in the past and use the most recent value
|
||||
MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1
|
||||
|
||||
COLUMNS_TO_MERGE_ON = [
|
||||
"PROPERTY_TYPE",
|
||||
"BUILT_FORM",
|
||||
|
|
@ -109,12 +103,11 @@ COMPONENT_FEATURES = [
|
|||
"NUMBER_OPEN_FIREPLACES",
|
||||
"MAINHEATCONT_DESCRIPTION",
|
||||
"EXTENSION_COUNT",
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT",
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
AVERAGE_FIXED_FEATURES = ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
# the most accurate
|
||||
|
|
|
|||
|
|
@ -5,13 +5,11 @@ from tqdm import tqdm
|
|||
from pathlib import Path
|
||||
from simulation_system.core.Settings import (
|
||||
MANDATORY_FIXED_FEATURES,
|
||||
AVERAGE_FIXED_FEATURES,
|
||||
LATEST_FIELD,
|
||||
COMPONENT_FEATURES,
|
||||
RDSAP_RESPONSE,
|
||||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
MULTIPLE_VALUES_MARGIN_FOR_ERROR,
|
||||
)
|
||||
from simulation_system.core.DataProcessor import DataProcessor
|
||||
from utils import save_dataframe_to_s3_parquet
|
||||
|
|
@ -19,9 +17,6 @@ from utils import save_dataframe_to_s3_parquet
|
|||
DATA_DIRECTORY = Path(__file__).parent / "simulation_system" / "data" / "all-domestic-certificates"
|
||||
|
||||
|
||||
# TODO: Have a look at temporal features
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -34,58 +29,20 @@ def app():
|
|||
dataset = []
|
||||
cleaning_dataset = []
|
||||
|
||||
# TODO: Does energy tariff make a difference
|
||||
# TODO: If SAP hasn't changed, we don't include the record
|
||||
# TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
|
||||
# TODO: Same as floor area for floor height
|
||||
# TODO: If fundamental building fabric changes, we should probably discard the record
|
||||
# TODO: Should we prune records that have an exceptionally large amount of time between them?
|
||||
# TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
|
||||
# TODO [x] : Does energy tariff make a difference
|
||||
# - leave for now but it may not
|
||||
# TODO [x] : If SAP hasn't changed, we don't include the record
|
||||
# TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks
|
||||
# worse in the newer epc, so we can switch the orders
|
||||
# TODO [] : Have a look at temporal features
|
||||
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
|
||||
# TODO [x]: Same as floor area for floor height
|
||||
# TODO []: If fundamental building fabric changes, we should probably discard the record
|
||||
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
|
||||
# - leave for now and check performance after temporal features
|
||||
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
|
||||
# - Leave for now
|
||||
#
|
||||
# TODO: REMOVE ME
|
||||
dodgy_uprns = []
|
||||
observed_uprns = [
|
||||
"10002082244", # Doesn't really make sense, house no longer has lel and not has more insulation but lower score
|
||||
"10002082259",
|
||||
# Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. Also the
|
||||
# floor assessment is now assumed whereas before it wasnt
|
||||
"10002082418", # Walls went from insulated to not...
|
||||
"10002082640", # Property identical besides different energy taffiff
|
||||
"10002082830", # Lots of records going from not insulated to insulated but some parts of
|
||||
# the property has gotten better
|
||||
"10002083244", # latest epc indicates the property is worse
|
||||
"10002083592", # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the
|
||||
# floor type has changed from solid to suspended. lel has decreased
|
||||
"100030533576", # property slightly worse, has less lels and the floor description has changed type
|
||||
"100030533668", # has slightly less lels. Glazed type is now missing
|
||||
"100030533803", # Not super clea why this is lower, newer epc has more lel but is using second heating
|
||||
"100030534016", # Property has less lel but more roof insulation. Floor type has changed
|
||||
"100030534040", # property has less lel and the floor type has changed
|
||||
"100030534041", # property has less insulation and less lel
|
||||
"100030534243", # Cavity wall has gone from filled to unfilled
|
||||
"100030534294", # less roof insulation but now has an air source heat pump
|
||||
"100030534322", # identical between records but now with higher lel but no change recorded
|
||||
"100030534413", # identical between records but different energy tariff, no sap change
|
||||
"100030534437", # property has less lel and the mainheating no longer has a programmer and trvs
|
||||
"100030534569", # Cavity wall no longer filled, 30mm more roof insulation in newest epc
|
||||
"100030534676", # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but
|
||||
# the wall cavity is no longer filled
|
||||
"100030534732", # property has higher lel %. Not clear why this is worse, glazing type has changed.
|
||||
# This looks dodgy as the UPRN_SOURCE is address matched; also the floor area has increased from the first to
|
||||
# the later epc
|
||||
"100030534791", # Property has started using secondary heating - the EPCs are taken on the same day so maybe we
|
||||
# should discard
|
||||
"100030534795", # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66
|
||||
# The newer epc indicates the property now has 40% photo supply so this doesn't make much sense
|
||||
"100030534897", # Roof has gone from thatched with additional insulation to pitched with insulation,
|
||||
# sap score hasn't changed
|
||||
"100030534986", # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and
|
||||
# slightly better main heating setup
|
||||
"100030535043", # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and
|
||||
# wall height
|
||||
"100030535173", # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation
|
||||
"100030535244", # lel gone from 100% to 0%, sap is the same
|
||||
]
|
||||
|
||||
for directory in tqdm(directories):
|
||||
|
||||
|
|
@ -121,18 +78,6 @@ def app():
|
|||
cols_to_merge_on=COLUMNS_TO_MERGE_ON
|
||||
)
|
||||
|
||||
for field in AVERAGE_FIXED_FEATURES:
|
||||
|
||||
vals = list(modified_property_data[field].dropna().unique())
|
||||
if len(vals) > 1:
|
||||
lowest_value = min(vals)
|
||||
largest_value = max(vals)
|
||||
if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
fixed_data[field] = np.mean(vals)
|
||||
|
||||
# Combine all fields together
|
||||
fixed_data.update(mandatory_field_data)
|
||||
fixed_data.update(latest_field_data)
|
||||
|
|
@ -152,46 +97,28 @@ def app():
|
|||
if idx >= modified_property_data.shape[0] - 1:
|
||||
break
|
||||
|
||||
starting_record = variable_data.iloc[idx]
|
||||
ending_record = variable_data.iloc[idx + 1]
|
||||
rdsap_change = (
|
||||
ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
|
||||
)
|
||||
heat_demand_change = (
|
||||
ending_record[HEAT_DEMAND_RESPONSE]
|
||||
- starting_record[HEAT_DEMAND_RESPONSE]
|
||||
)
|
||||
earliest_record = variable_data.iloc[idx]
|
||||
latest_record = variable_data.iloc[idx + 1]
|
||||
|
||||
# Check for a change in the starting and ending record
|
||||
check_cols = [
|
||||
col for col in starting_record.index if col not in [
|
||||
"LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF"
|
||||
]
|
||||
]
|
||||
all_same = True
|
||||
for col in check_cols:
|
||||
if starting_record[col] != ending_record[col]:
|
||||
all_same = False
|
||||
break
|
||||
# Check if the sap gets better or worse
|
||||
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
|
||||
|
||||
if rdsap_change <= 0:
|
||||
if all_same | (uprn in observed_uprns):
|
||||
if uprn not in observed_uprns:
|
||||
dodgy_uprns.append(uprn)
|
||||
else:
|
||||
compare = pd.concat([starting_record, ending_record], axis=1)
|
||||
bljd
|
||||
if gets_better:
|
||||
rdsap_change = latest_record[RDSAP_RESPONSE] - earliest_record[RDSAP_RESPONSE]
|
||||
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
else:
|
||||
rdsap_change = earliest_record[RDSAP_RESPONSE] - latest_record[RDSAP_RESPONSE]
|
||||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - latest_record[HEAT_DEMAND_RESPONSE]
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
starting_record = starting_record[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
|
||||
].add_suffix("_STARTING")
|
||||
ending_record = ending_record[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE"]
|
||||
].add_suffix("_ENDING")
|
||||
if gets_better:
|
||||
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
else:
|
||||
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
|
|
@ -205,7 +132,18 @@ def app():
|
|||
}
|
||||
)
|
||||
|
||||
dataset.append(property_model_data)
|
||||
property_model_df = pd.DataFrame(property_model_data)
|
||||
# Add some temporal features - we look at the days from the standard starting point in time
|
||||
# for the starting and ending date so all records are from a fixed point
|
||||
# TODO: implement me
|
||||
property_model_df["DAYS_TO_STARTING"] = None
|
||||
property_model_df["DAYS_TO_ENDING"] = None
|
||||
|
||||
dataset.append(property_model_df)
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
||||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
cleaning_dataset.append(cleaning_averages)
|
||||
|
|
@ -218,8 +156,12 @@ def app():
|
|||
file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
output = pd.DataFrame(dataset)
|
||||
output.to_parquet("./dataset.parquet")
|
||||
output = pd.concat(dataset)
|
||||
save_dataframe_to_s3_parquet(
|
||||
df=output,
|
||||
bucket_name="retrofit-data-dev",
|
||||
file_key="sap_change_model/dataset.parquet",
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue