# Mirror of https://github.com/Hestia-Homes/Model.git
# Synced 2026-06-08 11:17:27 +00:00
# 536 lines, 17 KiB, Python
# Using a simple Python file as settings for now
# TODO: migrate to dynaconf
from pathlib import Path
# Sentinel values that appear in EPC data in place of a real value. The per-entry notes below explain
# where each one comes from.
DATA_ANOMALY_MATCHES = {
    # Invalid reports are where the value provided is out of bounds, e.g. a negative energy rating of -1199 or a
    # non-integer, there is no valid energy band for this, so it is marked as INVALID!
    "INVALID",
    "INVALID!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data
    # item, i.e. a non-mandatory item.
    "NO DATA!",
    "NODATA!",
    # When the energy certificate was first lodged on the register there was no requirement to lodge this data item,
    # i.e. a non-mandatory item.
    "N/A",
    # A value generated by the register to account for a data item that was not mandatory when the lodgement of
    # the energy certificate occurred. When the data item became mandatory the register operator, for backwards
    # compatibility purposes, populated the data field with a value of ‘not recorded’ to ensure that the energy
    # certificate retrieval process is successfully completed. Mandatory data items cannot be applied
    # retrospectively to energy certificates lodged before the date of the change.
    "Not recorded",
    # The data also contains DECs with an operational rating of ‘9999’ (a ‘default’ DEC). The production of a
    # ‘default’ DEC value was allowed to enable building occupiers, with poor quality or no energy data,
    # the opportunity to comply with the regulations. From April 2011 the ability to lodge a ‘default’ DEC was no
    # longer allowed.
    "9999",
    # The Building Emission Rate (BER) data field for non-domestic buildings may contain a ‘blank’ value. The BER
    # was only lodged on the register from 7 March 2010.
    #
    # The trailing comma matters: without it Python joins this literal with the next string literal in the
    # set ("NULL") into the single value "BlankNULL", so neither sentinel would be matched.
    "Blank",
    # There are currently just over 8,600 records where the local authority identifier is ‘null’. This is due to
    # the Register Operator not being able to match the building address in the Markermap Ordinance Survey (GB)
    # lookup tables or OS MasterMap Address Layer 2 data. The majority of these addresses have been requested
    # manually by energy assessors for inclusion by the Register Operator in the registers (e.g. new builds,
    # etc). These records are being published for completeness. An ongoing process to manage these manually added
    # addresses will take time to develop to deal with these and future anomalies.
    #
    # There are several fields within the lodged data where it is possible to enter multiple entries to cater for
    # different types of build within a single property, i.e. extensions. This results in multiple entries for
    # the description fields for floor, roof and wall. For the purposes of this data release only the information
    # contained within the first of these multiple entries is being provided. As there are no restrictions on the
    # value in this first field it means that sometimes the first field in a multiple entry description field may
    # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
    "NULL",
    # We sometimes see fields populated with just an empty string.
    "",
    # We sometimes find None values - particularly when we produce an estimated EPC
    None,
    # An older value which rarely shows up but has been seen in the data.
    "UNKNOWN",
}

# Text fragments that mark a value as anomalous (see the note on the entry below).
DATA_ANOMALY_SUBSTRINGS = {
    # Where values in a ‘pick’ list that have been superseded by another value. For example, where a value for
    # ‘pitched roof’ has been replaced by three sub-categories of pitched roof. The original value is retained
    # but ‘for backward compatibility only’ it is appended to ensure that the energy certificate retrieval
    # process can be successfully completed. Replacement data items cannot be applied retrospectively to energy
    # certificates lodged on the register before the date of the change.
    "for backward compatibility only",
}

# Metric-related names.
METRIC_FILENAME = "metrics.csv"

OPTIMISE_METRIC = "mean_absolute_error"
BEST_MODEL_COLUMN_NAME = "best_model"

# Residual-plot labels, output file name and seaborn styling values.
# TODO: remove these settings from here / move them elsewhere for CML
RESIDUAL_TRUE_LABEL = "true"
RESIDUAL_PREDICTION_LABEL = "pred"
RESIDUAL_FILE = "residual.png"
SEABORN_RESIDUAL_AXIS_FONTSIZE = 12
SEABORN_RESIDUAL_TITLE_FONTSIZE = 22
SEABORN_RESIDUAL_STYLE = "whitegrid"
SEABORN_RESIDUAL_ASPECT_RATIO = "equal"
SEABORN_RESIDUAL_PLOT_DPI = 120
SEABORN_RESIDUAL_RANGE = [-100, 100]
SEABORN_RESIDUAL_LINE_COLOUR = "black"
SEABORN_RESIDUAL_LINE_WIDTH = 1

# Can move to a hyperparameters file
# If anything we might want to have a file that can be loaded and sent to this script
MODEL_HYPERPARAMETERS = {
    # Settings keyed by model family name.
    "autogluon": {
        "problem_type": "regression",
        "eval_metric": "mean_absolute_error",
        "time_limit": 45,
        "presets": "medium_quality",
        "excluded_model_types": None,
    },
}

# strftime-style pattern: year_month_day_hour_minute_second.
TIMESTAMP_FORMAT = "%Y_%m_%d_%H_%M_%S"

RANDOM_SEED = 0
SUBSAMPLE_FACTOR = 200

# Parquet file names for the train/validation and test datasets.
TRAIN_AND_VALIDATION_DATA_NAME = "train_validation_data.parquet"
TEST_DATA_NAME = "test_data.parquet"

# Model-registry file/folder names and the paths derived from them.
REGISTRY_FILE = "model_registry.csv"
MODEL_DIRECTORY = "model_directory"
# Resolved relative to this file: <directory two levels above this file>/model_directory.
BASE_REGISTRY_PATH = Path(__file__).parent.parent / MODEL_DIRECTORY
PREDICTION_LOCATION = Path("predictions")
PREDICTION_FILE = "prediction.json"
METADATA_FILE = "metadata.json"
MODEL_FOLDER = "model"
METRICS_FOLDER = "metrics"
DEPLOYMENT_FOLDER = "deployment"

# National-average values for floor area and floor height.
# NOTE(review): units are not stated here (presumably m² and m) — confirm.
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45

# Current-cost column names.
COST_FEATURES = [
    "LIGHTING_COST_CURRENT",
    "HEATING_COST_CURRENT",
    "HOT_WATER_COST_CURRENT",
]

# NOTE(review): presumably the fixed features that get averaged across records — confirm with the consumer.
AVERAGE_FIXED_FEATURES = [
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]

# Column names used as merge keys.
COLUMNS_TO_MERGE_ON = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
]

# Glazing description strings that count as "fully glazed".
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]

# Column names grouped as the "fixed" features of a property.
FIXED_FEATURES = [
    "PROPERTY_TYPE",
    "BUILT_FORM",
    "CONSTRUCTION_AGE_BAND",
    "NUMBER_HABITABLE_ROOMS",
    "CONSTITUENCY",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
]

# Core building-component column names; COMPONENT_FEATURES extends this list.
CORE_COMPONENT_FEATURES = [
    "WALLS_DESCRIPTION",
    "FLOOR_DESCRIPTION",
    "LIGHTING_DESCRIPTION",
    "ROOF_DESCRIPTION",
    "MAINHEAT_DESCRIPTION",
    "HOTWATER_DESCRIPTION",
    "MAIN_FUEL",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "WINDOWS_DESCRIPTION",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "MAINHEATCONT_DESCRIPTION",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
]

# Per-component energy-efficiency rating columns.
EFFICIENCY_FEATURES = [
    "HOT_WATER_ENERGY_EFF",
    "FLOOR_ENERGY_EFF",
    "WINDOWS_ENERGY_EFF",
    "WALLS_ENERGY_EFF",
    "SHEATING_ENERGY_EFF",
    "ROOF_ENERGY_EFF",
    "MAINHEAT_ENERGY_EFF",
    "MAINHEATC_ENERGY_EFF",
    "LIGHTING_ENERGY_EFF",
]

# Room-count feature names.
# NOTE(review): these are lower-case, unlike the upper-case column names used elsewhere in this file —
# confirm that is what the consumer expects.
ROOM_FEATURES = ["number_habitable_rooms", "number_heated_rooms"]

# The core component features plus additional per-record columns.
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
    "TRANSACTION_TYPE",
    "ENERGY_TARIFF",  # Not sure if this is relevant
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    # 'GLAZED_AREA',  # May not need this since we have MULTI_GLAZE_PROPORTION
]

# "Potential" columns of the EPC record.
POTENTIAL_COLUMNS = [
    "POTENTIAL_ENERGY_EFFICIENCY",
    "ENVIRONMENT_IMPACT_POTENTIAL",
    "ENERGY_CONSUMPTION_POTENTIAL",
    "CO2_EMISSIONS_POTENTIAL",
    # We don't include cost features for the moment
    # 'LIGHTING_COST_POTENTIAL',
    # 'HEATING_COST_POTENTIAL',
    # 'HOT_WATER_COST_POTENTIAL'
]

# For these fields, we take the latest value if we have multiple values.
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate.
LATEST_FIELD = [
    "NUMBER_HABITABLE_ROOMS",
    "NUMBER_HEATED_ROOMS",
    "FIXED_LIGHTING_OUTLETS_COUNT",
    "CONSTRUCTION_AGE_BAND",  # This is a field we'll probably want to use Verisk data for
]

# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"]

# For particularly old EPC data, we have inconsistent records, so we only keep sufficiently recent EPCs
# (see the cut-off below). SAP09 was introduced in 2009 and later SAP12 was introduced in England and
# Wales from 31 July 2014.
# NOTE(review): the original comment said "after 2010", but the value below is 2014-08-01 (the day after
# the SAP12 date mentioned above) — confirm which cut-off is intended.
EARLIEST_EPC_DATE = "2014-08-01"

# Values of transaction type, floor level, property type and tenure that are ignored.
# NOTE(review): IGNORED_TRANSACTION_TYPES and IGNORED_PROPERTY_TYPES are plain strings, whereas
# IGNORED_FLOOR_LEVELS and IGNORED_TENURES are lists — confirm consumers expect a scalar for the former.
IGNORED_TRANSACTION_TYPES = "new dwelling"
IGNORED_FLOOR_LEVELS = ["top floor", "mid floor"]
IGNORED_PROPERTY_TYPES = "Park home"
IGNORED_TENURES = [
    # One single entry: the two adjacent literals are joined by implicit string concatenation.
    "Not defined - use in the case of a new dwelling for which the intended tenure in not known. It is not to be used "
    "for an existing dwelling"
]

# Response (target) column names.
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"

# Maps the "Enclosed" built-form variants onto their plain equivalents.
BUILT_FORM_REMAP = {
    "Enclosed End-Terrace": "End-Terrace",
    "Enclosed Mid-Terrace": "Mid-Terrace",
}

# Options for the data processor.
DATA_PROCESSOR_SETTINGS = {
    "low_memory": False,
    "epc_minimum_count": 1,
    # NOTE(review): presumably the sequence of type casts applied to the column (int, then str) — confirm.
    "column_mappings": {"UPRN": [int, str]},
}

# This has a manual mapping of the column types required.
# Keys are column names and values are dtype names ("object" / "float64").
COLUMNTYPES = {
    "UPRN": "object",
    "TOTAL_FLOOR_AREA": "float64",
    "FLOOR_HEIGHT": "float64",
    "PROPERTY_TYPE": "object",
    "BUILT_FORM": "object",
    "CONSTITUENCY": "object",
    "NUMBER_HABITABLE_ROOMS": "float64",
    "NUMBER_HEATED_ROOMS": "float64",
    "FIXED_LIGHTING_OUTLETS_COUNT": "float64",
    "CONSTRUCTION_AGE_BAND": "object",
    "TRANSACTION_TYPE": "object",
    "WALLS_DESCRIPTION": "object",
    "FLOOR_DESCRIPTION": "object",
    "LIGHTING_DESCRIPTION": "object",
    "ROOF_DESCRIPTION": "object",
    "MAINHEAT_DESCRIPTION": "object",
    "HOTWATER_DESCRIPTION": "object",
    "MAIN_FUEL": "object",
    "MECHANICAL_VENTILATION": "object",
    "SECONDHEAT_DESCRIPTION": "object",
    "ENERGY_TARIFF": "object",
    "SOLAR_WATER_HEATING_FLAG": "object",
    "PHOTO_SUPPLY": "float64",
    "WINDOWS_DESCRIPTION": "object",
    "GLAZED_TYPE": "object",
    "MULTI_GLAZE_PROPORTION": "float64",
    "LOW_ENERGY_LIGHTING": "float64",
    "NUMBER_OPEN_FIREPLACES": "float64",
    "MAINHEATCONT_DESCRIPTION": "object",
    "EXTENSION_COUNT": "float64",
    "LODGEMENT_DATE": "object",
    # Every efficiency feature is typed "object" and every potential column "float64".
    # dict.fromkeys builds the same mapping, in the same key order, as zipping the names with a repeated value.
    **dict.fromkeys(EFFICIENCY_FEATURES, "object"),
    **dict.fromkeys(POTENTIAL_COLUMNS, "float64"),
}

# For modelling, we don't allow records with more than 100 SAP points
MAX_SAP_SCORE = 100

# Per-column replacement values for missing entries.
fill_na_map = {
    # There are some descriptions, such as "To be used only when there is no heating/hot-water system or data is from
    # a community network" that could be clustered with unknown fuel
    "MAIN_FUEL": "UNKNOWN",
    "MECHANICAL_VENTILATION": "Unknown",
    "SECONDHEAT_DESCRIPTION": "None",
    "ENERGY_TARIFF": "Unknown",
    # We set solar water heating flag to N - we could investigate using a different category entirely
    "SOLAR_WATER_HEATING_FLAG": "N",
    "GLAZED_TYPE": "not defined",
    "MULTI_GLAZE_PROPORTION": 0,
    "LOW_ENERGY_LIGHTING": 0,
    "MAINHEATCONT_DESCRIPTION": "Unknown",
    "EXTENSION_COUNT": 0,
    "NUMBER_OPEN_FIREPLACES": 0,
}

################################################################################################
# These are the features we need for scoring
# We'll likely change how we do this in the future
################################################################################################

# NOTE(review): presumably the columns that receive a "starting" suffix when building scoring features —
# confirm against the scoring code.
STARTING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    "estimated_perimeter",
]
# NOTE(review): presumably the derived component columns that are used without any suffix — confirm
# against the scoring code.
NO_SUFFIX_COMPONENT_COLS = [
    # Walls
    "walls_thermal_transmittance",
    "is_cavity_wall",
    "is_filled_cavity",
    "is_solid_brick",
    "is_system_built",
    "is_timber_frame",
    "is_granite_or_whinstone",
    "is_as_built",
    "is_cob",
    "is_sandstone_or_limestone",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    # Floor
    "floor_thermal_transmittance",
    "is_to_unheated_space",
    "is_to_external_air",
    "is_suspended",
    "is_solid",
    "another_property_below",
    "floor_insulation_thickness",
    # Roof
    "roof_thermal_transmittance",
    "is_pitched",
    "is_roof_room",
    "is_loft",
    "is_flat",
    "is_thatched",
    "is_at_rafters",
    "has_dwelling_above",
    "roof_insulation_thickness",
    # Heating, hot water, controls, glazing and fuel
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
]

# NOTE(review): presumably the columns that receive an "ending" suffix when building scoring features —
# confirm against the scoring code.
ENDING_SUFFIX_COMPONENT_COLS = [
    "SAP",
    "HEAT_DEMAND",
    "CARBON",
    "TRANSACTION_TYPE",
    "MECHANICAL_VENTILATION",
    "SECONDHEAT_DESCRIPTION",
    "ENERGY_TARIFF",
    "SOLAR_WATER_HEATING_FLAG",
    "PHOTO_SUPPLY",
    "GLAZED_TYPE",
    "MULTI_GLAZE_PROPORTION",
    "LOW_ENERGY_LIGHTING",
    "NUMBER_OPEN_FIREPLACES",
    "EXTENSION_COUNT",
    "TOTAL_FLOOR_AREA",
    "FLOOR_HEIGHT",
    "DAYS_TO",
    # Walls, floor and roof
    "walls_thermal_transmittance",
    "is_park_home",
    "walls_insulation_thickness",
    "external_insulation",
    "internal_insulation",
    "floor_thermal_transmittance",
    "floor_insulation_thickness",
    "roof_thermal_transmittance",
    "roof_insulation_thickness",
    # Heating, hot water, controls, glazing and fuel
    "heater_type",
    "system_type",
    "thermostat_characteristics",
    "heating_scope",
    "energy_recovery",
    "hotwater_tariff_type",
    "extra_features",
    "chp_systems",
    "distribution_system",
    "no_system_present",
    "appliance",
    "has_radiators",
    "has_fan_coil_units",
    "has_pipes_in_screed_above_insulation",
    "has_pipes_in_insulated_timber_floor",
    "has_pipes_in_concrete_slab",
    "has_boiler",
    "has_air_source_heat_pump",
    "has_room_heaters",
    "has_electric_storage_heaters",
    "has_warm_air",
    "has_electric_underfloor_heating",
    "has_electric_ceiling_heating",
    "has_community_scheme",
    "has_ground_source_heat_pump",
    "has_no_system_present",
    "has_portable_electric_heaters",
    "has_water_source_heat_pump",
    "has_electric_heat_pump",
    "has_micro-cogeneration",
    "has_solar_assisted_heat_pump",
    "has_exhaust_source_heat_pump",
    "has_community_heat_pump",
    "has_electric",
    "has_mains_gas",
    "has_wood_logs",
    "has_coal",
    "has_oil",
    "has_wood_pellets",
    "has_anthracite",
    "has_dual_fuel_mineral_and_wood",
    "has_smokeless_fuel",
    "has_lpg",
    "has_b30k",
    "has_electricaire",
    "has_assumed_for_most_rooms",
    "has_underfloor_heating",
    "thermostatic_control",
    "charging_system",
    "switch_system",
    "no_control",
    "dhw_control",
    "community_heating",
    "multiple_room_thermostats",
    "auxiliary_systems",
    "trvs",
    "rate_control",
    "glazing_type",
    "fuel_type",
    "main-fuel_tariff_type",
    "is_community",
    "no_individual_heating_or_community_network",
    "complex_fuel_type",
    "estimated_perimeter",
]

# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore
# filter out any homes with a floor height below this
MINIMUM_FLOOR_HEIGHT = 1.65