Model/etl/epc/settings.py
2023-10-20 11:46:38 +11:00

287 lines
12 KiB
Python

# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path
# Metric reporting settings
METRIC_FILENAME = "metrics.csv"
OPTIMISE_METRIC = "mean_absolute_error"
BEST_MODEL_COLUMN_NAME = "best_model"

# Residual plot settings
# TODO: move/remove these settings elsewhere for CML
RESIDUAL_TRUE_LABEL = "true"
RESIDUAL_PREDICTION_LABEL = "pred"
RESIDUAL_FILE = "residual.png"

SEABORN_RESIDUAL_AXIS_FONTSIZE = 12
SEABORN_RESIDUAL_TITLE_FONTSIZE = 22
SEABORN_RESIDUAL_STYLE = "whitegrid"
SEABORN_RESIDUAL_ASPECT_RATIO = "equal"
SEABORN_RESIDUAL_PLOT_DPI = 120
SEABORN_RESIDUAL_RANGE = [-100, 100]
SEABORN_RESIDUAL_LINE_COLOUR = "black"
SEABORN_RESIDUAL_LINE_WIDTH = 1
# These could move to a dedicated hyperparameters file.
# If anything, we might want a file that can be loaded and sent to this script.
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        "problem_type": "regression",
        "eval_metric": "mean_absolute_error",
        "time_limit": 45,
        "presets": "medium_quality",
        "excluded_model_types": None,
    },
}
# strftime-style format string used for timestamps
TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S"

RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200

# Dataset file names
TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
TEST_DATA_NAME = "test_data.parquet"

# Model registry: the registry directory sits two levels above this settings file
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY

# Prediction outputs
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = "prediction.json"
METADATA_FILE = "metadata.json"

# Folder names
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"
# National-average values for total floor area and floor height
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
# Columns used as merge keys
COLUMNS_TO_MERGE_ON = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
]
# Description strings that are treated as "fully glazed"
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]
# Columns classed as fixed features
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
# Core component columns; COMPONENT_FEATURES extends this list
CORE_COMPONENT_FEATURES = [
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
    "ROOF_DESCRIPTION",
    "MAINHEAT_DESCRIPTION",
    "HOTWATER_DESCRIPTION",
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
]
# Energy-efficiency rating columns (typed as "object" in COLUMNTYPES)
EFFICIENCY_FEATURES = [
    "HOT_WATER_ENERGY_EFF",
    "FLOOR_ENERGY_EFF",
    "WINDOWS_ENERGY_EFF",
    "WALLS_ENERGY_EFF",
    "SHEATING_ENERGY_EFF",
    "ROOF_ENERGY_EFF",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATC_ENERGY_EFF",
    "LIGHTING_ENERGY_EFF",
]
# Full component feature list: the core component columns followed by the extra columns below.
# A new list is built, so CORE_COMPONENT_FEATURES itself is left unmodified.
COMPONENT_FEATURES = [
    *CORE_COMPONENT_FEATURES,
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
]
# "Potential" columns (typed as "float64" in COLUMNTYPES)
POTENTIAL_COLUMNS = [
    "POTENTIAL_ENERGY_EFFICIENCY",
    "ENVIRONMENT_IMPACT_POTENTIAL",
    "ENERGY_CONSUMPTION_POTENTIAL",
    "CO2_EMISSIONS_POTENTIAL",
    # We don't include cost features for the moment:
    # 'LIGHTING_COST_POTENTIAL',
    # 'HEATING_COST_POTENTIAL',
    # 'HOT_WATER_COST_POTENTIAL'
]
# For these fields we take the latest value when there are multiple values.
# More recent EPCs have been conducted with more rigour, so we assume that the latest value is
# the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",  # This is a field we'll probably want to use Verisk data for
]
# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
MANDATORY_FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTITUENCY",
]
# Particularly old EPC data has inconsistent records, so we only include EPCs conducted after
# the cutoff date below. SAP09 was introduced in 2009 and SAP12 was later introduced in England
# and Wales from 31 July 2014.
# NOTE(review): this comment previously said "after 2010", which does not match the 2014-08-01
# value below - confirm which cutoff is intended.
EARLIEST_EPC_DATE = "2014-08-01"

# Response (target) column names
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"
# Maps the "Enclosed" built-form variants onto their plain terrace equivalents
BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
}
# Settings for the data processor
DATA_PROCESSOR_SETTINGS = {
    "low_memory": False,
    "epc_minimum_count": 1,
    "column_mappings": {
        "UPRN": [int, str],
    },
}
# Manual mapping of column name -> required column type.
# NOTE(review): the type names look like pandas/numpy dtype strings - confirm against the caller.
COLUMNTYPES = {
    "UPRN": "object",
    "TOTAL_FLOOR_AREA": "float64",
    "FLOOR_HEIGHT": "float64",
    "PROPERTY_TYPE": "object",
    "BUILT_FORM": "object",
    "CONSTITUENCY": "object",
    "NUMBER_HABITABLE_ROOMS": "float64",
    "NUMBER_HEATED_ROOMS": "float64",
    "FIXED_LIGHTING_OUTLETS_COUNT": "float64",
    "CONSTRUCTION_AGE_BAND": "object",
    "TRANSACTION_TYPE": "object",
    "WALLS_DESCRIPTION": "object",
    "FLOOR_DESCRIPTION": "object",
    "LIGHTING_DESCRIPTION": "object",
    "ROOF_DESCRIPTION": "object",
    "MAINHEAT_DESCRIPTION": "object",
    "HOTWATER_DESCRIPTION": "object",
    "MAIN_FUEL": "object",
    "MECHANICAL_VENTILATION": "object",
    "SECONDHEAT_DESCRIPTION": "object",
    "ENERGY_TARIFF": "object",
    "SOLAR_WATER_HEATING_FLAG": "object",
    "PHOTO_SUPPLY": "float64",
    "WINDOWS_DESCRIPTION": "object",
    "GLAZED_TYPE": "object",
    "MULTI_GLAZE_PROPORTION": "float64",
    "LOW_ENERGY_LIGHTING": "float64",
    "NUMBER_OPEN_FIREPLACES": "float64",
    "MAINHEATCONT_DESCRIPTION": "object",
    "EXTENSION_COUNT": "float64",
    "LODGEMENT_DATE": "object",
    # Every efficiency feature is typed "object" and every potential column "float64";
    # dict.fromkeys gives each key the same value while keeping the list order.
    **dict.fromkeys(EFFICIENCY_FEATURES, "object"),
    **dict.fromkeys(POTENTIAL_COLUMNS, "float64"),
}
# Upper bound on SAP points: for modelling, records with more than 100 SAP points are not allowed
MAX_SAP_SCORE = 100
# Per-column fill values for missing data
fill_na_map = {
    # There are some descriptions, such as "To be used only when there is no heating/hot-water
    # system or data is from a community network", that could be clustered with unknown fuel
    "MAIN_FUEL": "UNKNOWN",
    "MECHANICAL_VENTILATION": "Unknown",
    "SECONDHEAT_DESCRIPTION": "None",
    "ENERGY_TARIFF": "Unknown",
    # We set the solar water heating flag to N - we could investigate using a different
    # category entirely
    "SOLAR_WATER_HEATING_FLAG": "N",
    "GLAZED_TYPE": "not defined",
    "MULTI_GLAZE_PROPORTION": 0,
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
    "NUMBER_OPEN_FIREPLACES": 0,
}
################################################################################################
# These are the features we need for scoring.
# We'll likely change how we do this in the future.
################################################################################################
STARTING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "estimated_perimeter",
]
# Component columns listed without a suffix; order is preserved exactly
NO_SUFFIX_COMPONENT_COLS = [
    "walls_thermal_transmittance", "is_cavity_wall", "is_filled_cavity", "is_solid_brick",
    "is_system_built", "is_timber_frame", "is_granite_or_whinstone", "is_as_built", "is_cob",
    "is_sandstone_or_limestone", "is_park_home", "walls_insulation_thickness",
    "external_insulation", "internal_insulation",
    "floor_thermal_transmittance", "is_to_unheated_space", "is_to_external_air", "is_suspended",
    "is_solid", "another_property_below", "floor_insulation_thickness",
    "roof_thermal_transmittance", "is_pitched", "is_roof_room", "is_loft", "is_flat",
    "is_thatched", "is_at_rafters", "has_dwelling_above", "roof_insulation_thickness",
    "heater_type", "system_type", "thermostat_characteristics", "heating_scope",
    "energy_recovery", "hotwater_tariff_type", "extra_features", "chp_systems",
    "distribution_system", "no_system_present", "appliance",
    "has_radiators", "has_fan_coil_units", "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor", "has_pipes_in_concrete_slab",
    "has_boiler", "has_air_source_heat_pump", "has_room_heaters", "has_electric_storage_heaters",
    "has_warm_air", "has_electric_underfloor_heating", "has_electric_ceiling_heating",
    "has_community_scheme", "has_ground_source_heat_pump", "has_no_system_present",
    "has_portable_electric_heaters", "has_water_source_heat_pump", "has_electric_heat_pump",
    "has_micro-cogeneration", "has_solar_assisted_heat_pump", "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric", "has_mains_gas", "has_wood_logs", "has_coal", "has_oil", "has_wood_pellets",
    "has_anthracite", "has_dual_fuel_mineral_and_wood", "has_smokeless_fuel", "has_lpg",
    "has_b30k", "has_electricaire", "has_assumed_for_most_rooms", "has_underfloor_heating",
    "thermostatic_control", "charging_system", "switch_system", "no_control", "dhw_control",
    "community_heating", "multiple_room_thermostats", "auxiliary_systems", "trvs", "rate_control",
    "glazing_type", "fuel_type", "main-fuel_tariff_type", "is_community",
    "no_individual_heating_or_community_network", "complex_fuel_type",
]
# Component columns for the "ending" suffix; order is preserved exactly
ENDING_SUFFIX_COMPONENT_COLS = [
    "SAP", "HEAT_DEMAND", "CARBON", "TRANSACTION_TYPE", "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION", "ENERGY_TARIFF", "SOLAR_WATER_HEATING_FLAG", "PHOTO_SUPPLY",
    "GLAZED_TYPE", "MULTI_GLAZE_PROPORTION", "LOW_ENERGY_LIGHTING", "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT", "TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "DAYS_TO",
    "walls_thermal_transmittance", "is_park_home", "walls_insulation_thickness",
    "external_insulation", "internal_insulation",
    "floor_thermal_transmittance", "floor_insulation_thickness",
    "roof_thermal_transmittance", "roof_insulation_thickness",
    "heater_type", "system_type", "thermostat_characteristics", "heating_scope",
    "energy_recovery", "hotwater_tariff_type", "extra_features", "chp_systems",
    "distribution_system", "no_system_present", "appliance",
    "has_radiators", "has_fan_coil_units", "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor", "has_pipes_in_concrete_slab",
    "has_boiler", "has_air_source_heat_pump", "has_room_heaters", "has_electric_storage_heaters",
    "has_warm_air", "has_electric_underfloor_heating", "has_electric_ceiling_heating",
    "has_community_scheme", "has_ground_source_heat_pump", "has_no_system_present",
    "has_portable_electric_heaters", "has_water_source_heat_pump", "has_electric_heat_pump",
    "has_micro-cogeneration", "has_solar_assisted_heat_pump", "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric", "has_mains_gas", "has_wood_logs", "has_coal", "has_oil", "has_wood_pellets",
    "has_anthracite", "has_dual_fuel_mineral_and_wood", "has_smokeless_fuel", "has_lpg",
    "has_b30k", "has_electricaire", "has_assumed_for_most_rooms", "has_underfloor_heating",
    "thermostatic_control", "charging_system", "switch_system", "no_control", "dhw_control",
    "community_heating", "multiple_room_thermostats", "auxiliary_systems", "trvs", "rate_control",
    "glazing_type", "fuel_type", "main-fuel_tariff_type", "is_community",
    "no_individual_heating_or_community_network", "complex_fuel_type", "estimated_perimeter",
]
# Without any filtering, we found that the bottom 0.5% of homes had a floor height of 1.65m.
# We therefore filter out any homes with a floor height below this value.
MINIMUM_FLOOR_HEIGHT = 1.65