# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path

# Exact values that mark a field as anomalous / missing in the EPC data.
DATA_ANOMALY_MATCHES = {
    # Invalid reports are where the value provided is out of bounds, e.g. a negative energy rating of -1199 or a
    # non-integer, there is no valid energy band for this, so it is marked as INVALID!
    "INVALID",
    "INVALID!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data
    # item, i.e. a non-mandatory item.
    "NO DATA!",
    "NODATA!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data item,
    # i.e. a non-mandatory item.
    "N/A",
    # A value generated by the register to account for a data item that was not mandatory when the lodgement of
    # the energy certificate occurred. When the data item became mandatory the register operator, for backwards
    # compatibility purposes, populated the data field with a value of ‘not recorded’ to ensure that the energy
    # certificate retrieval process is successfully completed. Mandatory data items cannot be applied
    # retrospectively to energy certificates lodged before the date of the change.
    "Not recorded",
    "Not Recorded",
    # The data also contains DECs with an operational rating of ‘9999’ (a ‘default’ DEC). The production of a
    # ‘default’ DEC value was allowed to enable building occupiers, with poor quality or no energy data,
    # the opportunity to comply with the regulations. From April 2011 the ability to lodge a ‘default’ DEC was no
    # longer allowed.
    "9999",
    # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER
    # was only lodged on the register from 7 March 2010.
    # NOTE: the trailing comma matters - without it Python concatenates this literal with the next one ("NULL")
    # and the set ends up holding the single, useless entry "BlankNULL".
    "Blank",
    # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to
    # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB)
    # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
    # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
    # etc). These records are being published for completeness. An ongoing process to manage these manually added
    # addresses will take time to develop to deal with these and future anomalies.
    #
    # There are several fields within the lodged data where it is possible to enter multiple entries to cater for
    # different types of build within a single property, i.e. extensions. This results in multiple entries for
    # the description fields for floor, roof and wall. For the purposes of this data release only the information
    # contained within the first of these multiple entries is being provided. As there are no restrictions on the
    # value in this first field it means that sometimes the first field in a multiple entry description field may
    # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
    "NULL",
    # We sometimes see fields populated with just an empty string.
    "",
    # We sometimes find None values - particularly when we produce an estimated EPC
    None,
    # An older value which rarely shows up but has been seen in the data.
    "UNKNOWN",
    # "Unknown",
    # Observed error cases
    "(error), (error)",
    "error , error",
    "Description",
    "description",
    "Undefined Welsh description for crtrl code 2113",
    "undefined welsh description for crtrl code 2113",
    "Hot water system",
    "hot water system",
    "Heating system",
    "heating system",
}

# Add the post_sap10 date to indicate if the EPC is post SAP10
POST_SAP10_DATE = "2025-06-22"

# Substrings that mark a value as anomalous wherever they appear inside it.
DATA_ANOMALY_SUBSTRINGS = {
    # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
    # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
    # but ‘for backward compatibility only’ it is appended to ensure that the energy certificate retrieval
    # process can be successfully completed. Replacement data items cannot be applied retrospectively to energy
    # certificates lodged on the register before the date of the change.
    "for backward compatibility only"
}

METRIC_FILENAME = "metrics.csv"
OPTIMISE_METRIC = "mean_absolute_error"
BEST_MODEL_COLUMN_NAME = "best_model"

# TODO: remove these settings from here / move them elsewhere for CML
RESIDUAL_TRUE_LABEL = "true"
RESIDUAL_PREDICTION_LABEL = "pred"
RESIDUAL_FILE = "residual.png"
SEABORN_RESIDUAL_AXIS_FONTSIZE = 12
SEABORN_RESIDUAL_TITLE_FONTSIZE = 22
SEABORN_RESIDUAL_STYLE = "whitegrid"
SEABORN_RESIDUAL_ASPECT_RATIO = "equal"
SEABORN_RESIDUAL_PLOT_DPI = 120
SEABORN_RESIDUAL_RANGE = [-100, 100]
SEABORN_RESIDUAL_LINE_COLOUR = "black"
SEABORN_RESIDUAL_LINE_WIDTH = 1

# Can move to a hyperparameters file
# If anything we might want to have a file that can be loaded and sent to this script
MODEL_HYPERPARAMETERS = {
    "autogluon": {
        "problem_type": "regression",
        "eval_metric": "mean_absolute_error",
        "time_limit": 45,
        "presets": "medium_quality",
        "excluded_model_types": None,
    }
}

TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S"
RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200

TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
TEST_DATA_NAME = "test_data.parquet"

REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# Resolved relative to this settings file: two directory levels up, then MODEL_DIRECTORY.
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = "prediction.json"
METADATA_FILE = "metadata.json"
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"

TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

COST_FEATURES = [
    "LIGHTING_COST_CURRENT",
    "HEATING_COST_CURRENT",
    "HOT_WATER_COST_CURRENT",
]

AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]
COLUMNS_TO_MERGE_ON = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
]

FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]

FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]

CORE_COMPONENT_FEATURES = [
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
    "ROOF_DESCRIPTION",
    "MAINHEAT_DESCRIPTION",
    "HOTWATER_DESCRIPTION",
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
]

EFFICIENCY_FEATURES = [
    "HOT_WATER_ENERGY_EFF",
    "FLOOR_ENERGY_EFF",
    "WINDOWS_ENERGY_EFF",
    "WALLS_ENERGY_EFF",
    "SHEATING_ENERGY_EFF",
    "ROOF_ENERGY_EFF",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATC_ENERGY_EFF",
    "LIGHTING_ENERGY_EFF",
]

ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]

POST_SAP10_FEATURE = ["is_post_sap10"]

# The core component features followed by the extra per-EPC fields listed here.
COMPONENT_FEATURES = [
    *CORE_COMPONENT_FEATURES,
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
]

POTENTIAL_COLUMNS = [
    "POTENTIAL_ENERGY_EFFICIENCY",
    "ENVIRONMENT_IMPACT_POTENTIAL",
    "ENERGY_CONSUMPTION_POTENTIAL",
    "CO2_EMISSIONS_POTENTIAL",
    # We don't include cost features for the moment
    # 'LIGHTING_COST_POTENTIAL',
    # 'HEATING_COST_POTENTIAL',
    # 'HOT_WATER_COST_POTENTIAL'
]

# For these fields, we take the latest value if we have multiple values.
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",  # This is a field we probably want to use Verisk data for
]

# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"]

# For particularly old EPC data, we have inconsistent records so we'll only include EPCs that were
# conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
# and Wales from 31 July 2014.
# NOTE(review): the text above says "after 2010" but the cut-off value below is 2014-08-01 - confirm which is
# intended.
EARLIEST_EPC_DATE = "2014-08-01"

IGNORED_TRANSACTION_TYPES = "new dwelling"
IGNORED_FLOOR_LEVELS = ["top floor", "mid floor"]
IGNORED_PROPERTY_TYPES = "Park home"
IGNORED_TENURES = [
    "Not defined - use in the case of a new dwelling for which the intended tenure in not known. "
    "It is not to be used for an existing dwelling"
]

RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"

BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
}

DATA_PROCESSOR_SETTINGS = {
    "low_memory": False,
    "epc_minimum_count": 1,
    "column_mappings": {
        "UPRN": [int, str],
        "NUMBER_HEATED_ROOMS": [float],
        "NUMBER_HABITABLE_ROOMS": [float],
    },
}

# This has a manual mapping of the column types required.
# Every efficiency feature is typed "object" and every potential column "float64"; they are appended after
# the explicitly listed columns, in the order of their source lists.
COLUMNTYPES = {
    "UPRN": "object",
    "TOTAL_FLOOR_AREA": "float64",
    "FLOOR_HEIGHT": "float64",
    "PROPERTY_TYPE": "object",
    "BUILT_FORM": "object",
    "CONSTITUENCY": "object",
    "NUMBER_HABITABLE_ROOMS": "float64",
    "NUMBER_HEATED_ROOMS": "float64",
    "FIXED_LIGHTING_OUTLETS_COUNT": "float64",
    "CONSTRUCTION_AGE_BAND": "object",
    "TRANSACTION_TYPE": "object",
    "WALLS_DESCRIPTION": "object",
    "FLOOR_DESCRIPTION": "object",
    "LIGHTING_DESCRIPTION": "object",
    "ROOF_DESCRIPTION": "object",
    "MAINHEAT_DESCRIPTION": "object",
    "HOTWATER_DESCRIPTION": "object",
    "MAIN_FUEL": "object",
    "MECHANICAL_VENTILATION": "object",
    "SECONDHEAT_DESCRIPTION": "object",
    "ENERGY_TARIFF": "object",
    "SOLAR_WATER_HEATING_FLAG": "object",
    "PHOTO_SUPPLY": "float64",
    "WINDOWS_DESCRIPTION": "object",
    "GLAZED_TYPE": "object",
    "MULTI_GLAZE_PROPORTION": "float64",
    "LOW_ENERGY_LIGHTING": "float64",
    "NUMBER_OPEN_FIREPLACES": "float64",
    "MAINHEATCONT_DESCRIPTION": "object",
    "EXTENSION_COUNT": "float64",
    "LODGEMENT_DATE": "object",
    **dict.fromkeys(EFFICIENCY_FEATURES, "object"),
    **dict.fromkeys(POTENTIAL_COLUMNS, "float64"),
}

# For modelling, we don't allow records with more than 100 SAP points
MAX_SAP_SCORE = 100

# Per-column replacement values for missing entries.
fill_na_map = {
    # There are some descriptions, such as "To be used only when there is no heating/hot-water system or data is from
    # a community network" that could be clustered with unknown fuel
    "MAIN_FUEL": "UNKNOWN",
    "MECHANICAL_VENTILATION": "Unknown",
    "SECONDHEAT_DESCRIPTION": "None",
    "ENERGY_TARIFF": "Unknown",
    # We set solar water heating flag to N - we could investigate using a different category entirely
    "SOLAR_WATER_HEATING_FLAG": "N",
    "GLAZED_TYPE": "not defined",
    "MULTI_GLAZE_PROPORTION": 0,
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
    "NUMBER_OPEN_FIREPLACES": 0,
}

################################################################################################
# These are the features we need for scoring
# We'll likely change how we do this in the future
################################################################################################
STARTING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "estimated_perimeter",
]

NO_SUFFIX_COMPONENT_COLS = [
    "walls_thermal_transmittance",
    "is_cavity_wall",
    "is_filled_cavity",
    "is_solid_brick",
    "is_system_built",
    "is_timber_frame",
    "is_granite_or_whinstone",
    "is_as_built",
    "is_cob",
    "is_sandstone_or_limestone",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    "floor_thermal_transmittance",
    "is_to_unheated_space",
    "is_to_external_air",
    "is_suspended",
    "is_solid",
    "another_property_below",
    "floor_insulation_thickness",
    "roof_thermal_transmittance",
    "is_pitched",
    "is_roof_room",
    "is_loft",
    "is_flat",
    "is_thatched",
    "is_at_rafters",
    "has_dwelling_above",
    "roof_insulation_thickness",
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
]

ENDING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "walls_thermal_transmittance",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    "floor_thermal_transmittance",
    "floor_insulation_thickness",
    "roof_thermal_transmittance",
    "roof_insulation_thickness",
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
    "estimated_perimeter",
]

# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll
# therefore filter out any homes with a floor height below this
MINIMUM_FLOOR_HEIGHT = 1.65