Add seperated class for Dataprocessing - need to move all code into it eventually

This commit is contained in:
Michael Duong 2023-08-12 11:35:01 +00:00
parent 9fe640dd50
commit 92736084e8
4 changed files with 319 additions and 260 deletions

View file

@ -0,0 +1,142 @@
from pathlib import Path
import pandas as pd
from settings import (
DATA_PROCESSOR_SETTINGS,
EARLIEST_EPC_DATE,
FULLY_GLAZED_DESCRIPTIONS,
AVERAGE_FIXED_FEATURES,
FLOOR_HEIGHT_NATIONAL_AVERAGE,
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
)
class DataProcessor:
"""
Handle data loading and data preprocessing
"""
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
def pre_process(self) -> pd.DataFrame:
"""
Load data and begin initial cleaning
"""
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
self.confine_data()
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
self.clean_multi_glaze_proportion()
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
return self.data
def make_cleaning_averages(self) -> pd.DataFrame:
# Define a custom function to calculate the median, excluding missing values
def median_without_missing(group):
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
cleaning_averages = self.data.groupby(
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
observed=True
).apply(median_without_missing).reset_index()
general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
median_without_missing).reset_index()
property_averages = self.data.groupby(["PROPERTY_TYPE"], observed=True).apply(
median_without_missing).reset_index()
built_form_averages = self.data.groupby(["BUILT_FORM"], observed=True).apply(
median_without_missing).reset_index()
# We can clean up any NA's in the cleaning averages with the general averages here
cleaning_averages_filled = pd.merge(cleaning_averages, general_averages, on=['PROPERTY_TYPE', 'BUILT_FORM'], suffixes=['', '_AVERAGE'])
cleaning_averages_filled = pd.merge(cleaning_averages_filled, property_averages, on=['PROPERTY_TYPE'], suffixes=['', '_PROPERTY_AVERAGE'])
cleaning_averages_filled = pd.merge(cleaning_averages_filled, built_form_averages, on=['BUILT_FORM'], suffixes=['', '_BUILT_FORM_AVERAGE'])
# Replace any missing NAN values with averages for the same Property type and built form
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_AVERAGE'])
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_AVERAGE'])
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
# If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope and built form
# We can use just the property type average and replace
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE'])
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_PROPERTY_AVERAGE'])
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_PROPERTY_AVERAGE', 'FLOOR_HEIGHT_PROPERTY_AVERAGE'])
# If there are still NA values, use BUILT FORM averages
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE'])
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
cleaning_averages_filled = cleaning_averages_filled.drop(columns=['TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE', 'FLOOR_HEIGHT_BUILT_FORM_AVERAGE'])
# If there still is na values, use average across all properties in consituecy
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(cleaning_averages_filled['TOTAL_FLOOR_AREA'].mean())
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(cleaning_averages_filled['FLOOR_HEIGHT'].mean())
# If the consituency is all NA values, then take UK AVERAGE VALUES
cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
return cleaning_averages_filled
def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
'''
Reduce the data futher by keeping only datasets with multiple epcs
'''
counts = self.data.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]
# take UPRNS with multiple EPCs
counts = counts[counts["count"] > epc_minimum_count]
self.data = pd.merge(self.data, counts, on='UPRN')
def recast_df_columns(self, column_mappings: dict) -> None:
"""
Recast columns from the dataframe to ensure the behaviour we want
"""
for key, values in column_mappings.items():
if key not in self.data.columns:
print('Column mapping incorrectly specified')
exit(1)
for value in values:
self.data[key] = self.data[key].astype(value)
def confine_data(self) -> None:
"""
Include all step to reduce down the data based on assumptions
"""
# Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
# Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
# before the introduction of SAP09
# Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
# full SAP, which produces different results to the RdSAP methodology
# Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
self.data = self.data[~pd.isnull(self.data["UPRN"])]
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
def clean_multi_glaze_proportion(self) -> None:
"""
If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
"""
no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100

View file

@ -1,269 +1,42 @@
import numpy as np
import os
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
from pathlib import Path
from typing import Tuple
def list_subdirectories(directory_path):
return [entry for entry in directory_path.iterdir() if entry.is_dir()]
from settings import (
MANDATORY_FIXED_FEATURES,
AVERAGE_FIXED_FEATURES,
LATEST_FIELD,
COMPONENT_FEATURES,
RDSAP_RESPONSE,
HEAT_DEMAND_RESPONSE,
FLOOR_LEVEL_MAP,
BUILT_FORM_REMAP
)
from DataProcessor import DataProcessor
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'
FULLY_GLAZED_DESCRIPTIONS = [
"Fully double glazed",
"High performance glazing",
"Fully triple glazed",
"Full secondary glazing",
"Multiple glazing throughout",
]
FIXED_FEATURES = [
'PROPERTY_TYPE',
'BUILT_FORM',
'CONSTRUCTION_AGE_BAND',
'NUMBER_HABITABLE_ROOMS',
'CONSTITUENCY',
'NUMBER_HEATED_ROOMS',
'FIXED_LIGHTING_OUTLETS_COUNT',
'FLOOR_HEIGHT',
'FLOOR_LEVEL',
'TOTAL_FLOOR_AREA',
]
COMPONENT_FEATURES = [
'TRANSACTION_TYPE',
'WALLS_DESCRIPTION',
'FLOOR_DESCRIPTION',
'LIGHTING_DESCRIPTION',
'ROOF_DESCRIPTION',
'MAINHEAT_DESCRIPTION',
'HOTWATER_DESCRIPTION',
'MAIN_FUEL',
'MECHANICAL_VENTILATION',
'SECONDHEAT_DESCRIPTION',
'ENERGY_TARIFF', # Not sure if this is relevant
'SOLAR_WATER_HEATING_FLAG',
'PHOTO_SUPPLY',
'WINDOWS_DESCRIPTION',
'GLAZED_TYPE',
'MULTI_GLAZE_PROPORTION',
'LIGHTING_DESCRIPTION',
'LOW_ENERGY_LIGHTING',
'NUMBER_OPEN_FIREPLACES',
'MAINHEATCONT_DESCRIPTION',
'EXTENSION_COUNT',
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
]
# For these fields, we take an average if we have multiple values
AVERAGE_FIXED_FEATURES = [
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT"
]
# For these fields, we take the latest value if we have multiple values
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate
LATEST_FIELD = [
"NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS",
"FIXED_LIGHTING_OUTLETS_COUNT",
"FLOOR_LEVEL",
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
]
# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
MANDATORY_FIXED_FEATURES = [
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTITUENCY"
]
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
# and Wales from 31 July 2014
EARLIEST_EPC_DATE = "2014-08-01"
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
def iterative_filtering(cleaning_averages, property_data):
# Define the columns to filter on
columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS"]
# Merge datasets together on columns
filtered_data = pd.merge(cleaning_averages, property_data.iloc[[-1]], on=columns_to_filter)
# # Start with the entire cleaning_averages DataFrame
# filtered_data = cleaning_averages.copy()
# # Iterate through the columns and apply filters one by one
# for column in columns_to_filter:
# # Apply the filter using the value from property_data
# new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
# # If the filter results in no data, return the previous result
# if new_filtered_data.empty:
# continue
# # If the filter is successful, update the filtered data
# filtered_data = new_filtered_data
return filtered_data
def ordinal(n):
if 10 <= n % 100 <= 20:
suffix = 'th'
else:
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
return str(n) + suffix
FLOOR_LEVEL_MAP = {
"Basement": -1,
"Ground": 0,
"ground floor": 0,
"20+": 20,
"21st or above": 21,
**{str(i).zfill(2): i for i in range(0, 21)},
**{ordinal(i): i for i in range(-1, 21)},
**{str(i): i for i in range(-1, 21)},
**{i: i for i in range(-1, 21)},
}
BUILT_FORM_REMAP = {
"Enclosed End-Terrace": "End-Terrace",
"Enclosed Mid-Terrace": "Mid-Terrace",
}
DATA_PROCESSOR_SETTINGS = {
'low_memory': False,
'epc_minimum_count': 1,
'column_mappings': {'UPRN': [int, str]}
}
class DataProcessor:
"""
Handle data loading and data preprocessing
"""
def __init__(self, filepath: Path) -> None:
self.filepath = filepath
def load_data(self, low_memory=False) -> None:
self.data = pd.read_csv(self.filepath, low_memory=low_memory)
def process(self) -> pd.DataFrame:
"""
Load all data adnd process data via composition
"""
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
self.confine_data()
self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
self.clean_multi_glaze_proportion()
self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
return self.data
def make_cleaning_averages(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
# Define a custom function to calculate the median, excluding missing values
def median_without_missing(group):
return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
cleaning_averages = self.data.groupby(
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
observed=True
).apply(median_without_missing).reset_index()
general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
median_without_missing).reset_index()
return cleaning_averages, general_averages
def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
'''
Reduce the data futher by keeping only datasets with multiple epcs
'''
counts = self.data.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]
# take UPRNS with multiple EPCs
counts = counts[counts["count"] > epc_minimum_count]
self.data = pd.merge(self.data, counts, on='UPRN')
def recast_df_columns(self, column_mappings: dict) -> None:
"""
Recast columns from the dataframe to ensure the behaviour we want
"""
for key, values in column_mappings.items():
if key not in self.data.columns:
print('Column mapping incorrectly specified')
exit(1)
for value in values:
self.data[key] = self.data[key].astype(value)
def confine_data(self) -> None:
"""
Include all step to reduce down the data based on assumptions
"""
# Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
# Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
# before the introduction of SAP09
# Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
# full SAP, which produces different results to the RdSAP methodology
# Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
self.data = self.data[~pd.isnull(self.data["UPRN"])]
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
self.data = self.data[~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
def clean_multi_glaze_proportion(self) -> None:
"""
If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
"""
no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
def app():
# Get all the files in the directory
# Data glossary:
# https://epc.opendatacommunities.org/docs/guidance#glossary
directories = list_subdirectories(DATA_DIRECTORY)
# List all subdirectories
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
dataset = []
for directory in tqdm(directories):
filepath = directory / "certificates.csv"
data_processor = DataProcessor(filepath=filepath)
df = data_processor.process()
cleaning_averages, general_averages = data_processor.make_cleaning_averages()
df = data_processor.pre_process()
cleaning_averages = data_processor.make_cleaning_averages()
for uprn, property_data in df.groupby("UPRN", observed=True):
@ -280,44 +53,51 @@ def app():
# If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row
if max(modified_property_data[MANDATORY_FIXED_FEATURES].nunique()) > 1:
continue
mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
# Remap certain columns
modified_property_data['FLOOR_LEVEL'] = modified_property_data['FLOOR_LEVEL'].replace(FLOOR_LEVEL_MAP)
modified_property_data['BUILT_FROM'] = modified_property_data['BUILT_FORM'].replace(BUILT_FORM_REMAP)
# Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
latest_field_data = modified_property_data[LATEST_FIELD].iloc[-1].to_dict()
mandatory_field_data = modified_property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
# Taking just the last row, which is the percentage change from the latest to previous one only
# modified_property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1
# We can replace any NA values for Average fixed features
# We have columns that we want to merge on, but some of these columns are all NA values
# So we determine which columns to merge on, and get the equivalent grouping in the averages
columns_to_merge_on = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS"]
if any(modified_property_data[columns_to_merge_on].isna()):
# If there are any NA value, back fill first (i.e most recent), then forward fill if needed
modified_property_data[columns_to_merge_on] = modified_property_data[columns_to_merge_on].fillna(method='bfill').fillna(method='ffill')
# Extract the columns that are non all None
na_columns = modified_property_data[columns_to_merge_on].isna().all()
columns_to_merge_on = na_columns.index[~na_columns].to_list()
# Get the corresponding groupby and merge, and fill in NA values
cleaning_averages_to_merge = cleaning_averages.groupby(columns_to_merge_on)[['TOTAL_FLOOR_AREA', 'FLOOR_HEIGHT']].mean()
modified_property_data = pd.merge(modified_property_data, cleaning_averages_to_merge, on=columns_to_merge_on, suffixes=['', '_AVERAGE'])
modified_property_data['TOTAL_FLOOR_AREA'] = modified_property_data['TOTAL_FLOOR_AREA'].fillna(modified_property_data['TOTAL_FLOOR_AREA_AVERAGE'])
modified_property_data['FLOOR_HEIGHT'] = modified_property_data['FLOOR_HEIGHT'].fillna(modified_property_data['FLOOR_HEIGHT_AVERAGE'])
modified_property_data = modified_property_data.drop(columns=['TOTAL_FLOOR_AREA_AVERAGE', 'FLOOR_HEIGHT_AVERAGE'])
for field in AVERAGE_FIXED_FEATURES:
vals = list(modified_property_data[field].dropna().unique())
if len(vals) > 1:
# Check the values are too far apart
# TODO: we could have multiple values here, why only use the first two?
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
# Take the more recent value since it's likely to be more accurate
vals = [vals[-1]]
if vals:
field_value = np.mean(vals)
else:
# Clean using averages
avgs = iterative_filtering(cleaning_averages, modified_property_data)
# TODO: Should probably do a mean/median?
field_value = avgs[field].iloc[0]
if pd.isnull(field_value):
# Just the use the general averages
field_value = general_averages[
(general_averages["PROPERTY_TYPE"] == modified_property_data["PROPERTY_TYPE"].iloc[0]) &
(general_averages["BUILT_FORM"] == modified_property_data["BUILT_FORM"].iloc[0])
][field].iloc[0]
fixed_data[field] = field_value
#Combine all fields together
@ -369,6 +149,9 @@ def app():
dataset.extend(property_model_data)
output = pd.DataFrame(dataset)
output.to_parquet('./dataset.parquet')
if __name__ == "__main__":
app()

View file

@ -0,0 +1,114 @@
# Using a simply python file as settings for now
# TODO: migrate to dynaconf
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70
FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
FULLY_GLAZED_DESCRIPTIONS = [
"Fully double glazed",
"High performance glazing",
"Fully triple glazed",
"Full secondary glazing",
"Multiple glazing throughout",
]
FIXED_FEATURES = [
'PROPERTY_TYPE',
'BUILT_FORM',
'CONSTRUCTION_AGE_BAND',
'NUMBER_HABITABLE_ROOMS',
'CONSTITUENCY',
'NUMBER_HEATED_ROOMS',
'FIXED_LIGHTING_OUTLETS_COUNT',
'FLOOR_HEIGHT',
'FLOOR_LEVEL',
'TOTAL_FLOOR_AREA',
]
COMPONENT_FEATURES = [
'TRANSACTION_TYPE',
'WALLS_DESCRIPTION',
'FLOOR_DESCRIPTION',
'LIGHTING_DESCRIPTION',
'ROOF_DESCRIPTION',
'MAINHEAT_DESCRIPTION',
'HOTWATER_DESCRIPTION',
'MAIN_FUEL',
'MECHANICAL_VENTILATION',
'SECONDHEAT_DESCRIPTION',
'ENERGY_TARIFF', # Not sure if this is relevant
'SOLAR_WATER_HEATING_FLAG',
'PHOTO_SUPPLY',
'WINDOWS_DESCRIPTION',
'GLAZED_TYPE',
'MULTI_GLAZE_PROPORTION',
'LIGHTING_DESCRIPTION',
'LOW_ENERGY_LIGHTING',
'NUMBER_OPEN_FIREPLACES',
'MAINHEATCONT_DESCRIPTION',
'EXTENSION_COUNT',
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
]
# For these fields, we take an average if we have multiple values
AVERAGE_FIXED_FEATURES = [
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT"
]
# For these fields, we take the latest value if we have multiple values
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate
LATEST_FIELD = [
"NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS",
"FIXED_LIGHTING_OUTLETS_COUNT",
"FLOOR_LEVEL",
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
]
# If we see thee features changing, we don't use the EPC, since deem it not to be reliable
MANDATORY_FIXED_FEATURES = [
"PROPERTY_TYPE",
"BUILT_FORM",
"CONSTITUENCY"
]
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
# conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England
# and Wales from 31 July 2014
EARLIEST_EPC_DATE = "2014-08-01"
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
def ordinal(n):
if 10 <= n % 100 <= 20:
suffix = 'th'
else:
suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(n % 10, 'th')
return str(n) + suffix
FLOOR_LEVEL_MAP = {
"Basement": -1,
"Ground": 0,
"ground floor": 0,
"20+": 20,
"21st or above": 21,
**{str(i).zfill(2): i for i in range(0, 21)},
**{ordinal(i): i for i in range(-1, 21)},
**{str(i): i for i in range(-1, 21)},
**{i: i for i in range(-1, 21)},
}
BUILT_FORM_REMAP = {
"Enclosed End-Terrace": "End-Terrace",
"Enclosed Mid-Terrace": "Mid-Terrace",
}
DATA_PROCESSOR_SETTINGS = {
'low_memory': False,
'epc_minimum_count': 1,
'column_mappings': {'UPRN': [int, str]}
}

View file

@ -0,0 +1,20 @@
import os
from logging import Logger
logger = Logger(__name__)
def training():
"""
Pipeline to run training on the dataset
"""
logger.info('Loading data')
logger.info('Feature selection')
logger.info('Build Model')
logger.info('Evaluate matrics')
if __name__ == "__main__":
training()