working on rdsap cleaning

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-15 19:13:14 +01:00
parent 66824eaee7
commit b71e76449f
4 changed files with 129 additions and 84 deletions

View file

@ -14,7 +14,10 @@ from model_data.simulation_system.core.Settings import (
COLUMNS_TO_MERGE_ON, COLUMNS_TO_MERGE_ON,
COMPONENT_FEATURES, COMPONENT_FEATURES,
FIXED_FEATURES, FIXED_FEATURES,
COLUMNTYPES COLUMNTYPES,
RDSAP_RESPONSE,
MAX_SAP_SCORE,
fill_na_map
) )
from typing import List from typing import List
@ -101,10 +104,14 @@ class DataProcessor:
raise NotImplementedError("Not handled the case for value %s" % x) raise NotImplementedError("Not handled the case for value %s" % x)
self.data["CONSTRUCTION_AGE_BAND_CLEANED"] = self.data["CONSTRUCTION_AGE_BAND"].apply( self.data["CONSTRUCTION_AGE_BAND"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
lambda x: clean_construction_age_band(x) lambda x: clean_construction_age_band(x)
) )
self.data = self.data[
~pd.isnull(self.data["CONSTRUCTION_AGE_BAND"])
]
def clean_missing_rooms(self): def clean_missing_rooms(self):
""" """
For the number of heated rooms and number of habitable rooms, we clean these values up front, For the number of heated rooms and number of habitable rooms, we clean these values up front,
@ -132,7 +139,7 @@ class DataProcessor:
for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]: for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:
to_index = 3 to_index = 3
matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND_CLEANED", "POSTAL_AREA"] matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "POSTAL_AREA"]
has_missings = pd.isnull(self.data[col]).sum() has_missings = pd.isnull(self.data[col]).sum()
while has_missings: while has_missings:
self.data = apply_clean( self.data = apply_clean(
@ -175,6 +182,8 @@ class DataProcessor:
if not self.newdata: if not self.newdata:
self.confine_data() self.confine_data()
self.remap_columns()
# We have some non-standard construction age bands which we'll clean for matching # We have some non-standard construction age bands which we'll clean for matching
self.standardise_construction_age_band() self.standardise_construction_age_band()
self.clean_missing_rooms() self.clean_missing_rooms()
@ -189,7 +198,6 @@ class DataProcessor:
self.retain_multiple_epc_properties( self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"] epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
) )
self.remap_columns()
if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1: if DATA_PROCESSOR_SETTINGS["epc_minimum_count"] >= 1:
# If we have multiple EPC records, we can try and do filling # If we have multiple EPC records, we can try and do filling
@ -199,8 +207,14 @@ class DataProcessor:
# Final re-casting after data transformed and prepared # Final re-casting after data transformed and prepared
self.data = self.data.astype(COLUMNTYPES) self.data = self.data.astype(COLUMNTYPES)
self.na_remapping()
return self.data return self.data
def na_remapping(self):
for column, fill_value in fill_na_map.items():
self.data[column] = self.data[column].fillna(fill_value)
def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON): def fill_na_fields(self, columns_to_fill: List = COLUMNS_TO_MERGE_ON):
""" """
If we have a minimum of 2 epcs, we can do back fill and forward fill on certain data fields If we have a minimum of 2 epcs, we can do back fill and forward fill on certain data fields
@ -305,62 +319,43 @@ class DataProcessor:
suffixes=["", "_BUILT_FORM_AVERAGE"], suffixes=["", "_BUILT_FORM_AVERAGE"],
) )
# Replace any missing NAN values with averages for the same Property type and built form for variable in AVERAGE_FIXED_FEATURES:
cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[ # Replace any missing NAN values with averages for the same Property type and built form
"TOTAL_FLOOR_AREA" cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
].fillna(cleaning_averages_filled["TOTAL_FLOOR_AREA_AVERAGE"]) cleaning_averages_filled[f"{variable}_AVERAGE"]
cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[ )
"FLOOR_HEIGHT"
].fillna(cleaning_averages_filled["FLOOR_HEIGHT_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=["TOTAL_FLOOR_AREA_AVERAGE", "FLOOR_HEIGHT_AVERAGE"]
)
# If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_AVERAGE")
# and built form
# We can use just the property type average and replace
cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[
"TOTAL_FLOOR_AREA"
].fillna(cleaning_averages_filled["TOTAL_FLOOR_AREA_PROPERTY_AVERAGE"])
cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[
"FLOOR_HEIGHT"
].fillna(cleaning_averages_filled["FLOOR_HEIGHT_PROPERTY_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=[
"TOTAL_FLOOR_AREA_PROPERTY_AVERAGE",
"FLOOR_HEIGHT_PROPERTY_AVERAGE",
]
)
# If there are still NA values, use BUILT FORM averages # If there are still NA values i.e. the averages do not have values for a speicifc group of property tyope
cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[ # and built form
"TOTAL_FLOOR_AREA" # We can use just the property type average and replace
].fillna(cleaning_averages_filled["TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE"])
cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[
"FLOOR_HEIGHT"
].fillna(cleaning_averages_filled["FLOOR_HEIGHT_BUILT_FORM_AVERAGE"])
cleaning_averages_filled = cleaning_averages_filled.drop(
columns=[
"TOTAL_FLOOR_AREA_BUILT_FORM_AVERAGE",
"FLOOR_HEIGHT_BUILT_FORM_AVERAGE",
]
)
# If there still is na values, use average across all properties in consituecy cleaning_averages_filled[variable] = cleaning_averages_filled[variable].fillna(
cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[ cleaning_averages_filled[f"{variable}_PROPERTY_AVERAGE"]
"TOTAL_FLOOR_AREA" )
].fillna(cleaning_averages_filled["TOTAL_FLOOR_AREA"].mean())
cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[ cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_PROPERTY_AVERAGE")
"FLOOR_HEIGHT"
].fillna(cleaning_averages_filled["FLOOR_HEIGHT"].mean()) # If there are still NA values, use BUILT FORM averages
cleaning_averages_filled["variable"] = cleaning_averages_filled[variable].fillna(
cleaning_averages_filled[f"{variable}_BUILT_FORM_AVERAGE"]
)
cleaning_averages_filled = cleaning_averages_filled.drop(columns=f"{variable}_BUILT_FORM_AVERAGE")
# If there still is na values, use average across all properties in consituecy
cleaning_averages_filled[variable] = cleaning_averages_filled[
variable
].fillna(cleaning_averages_filled[variable].mean())
# If the consituency is all NA values, then take UK AVERAGE VALUES # If the consituency is all NA values, then take UK AVERAGE VALUES
cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[ # cleaning_averages_filled["TOTAL_FLOOR_AREA"] = cleaning_averages_filled[
"TOTAL_FLOOR_AREA" # "TOTAL_FLOOR_AREA"
].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE) # ].fillna(TOTAL_FLOOR_AREA_NATIONAL_AVERAGE)
cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[ # cleaning_averages_filled["FLOOR_HEIGHT"] = cleaning_averages_filled[
"FLOOR_HEIGHT" # "FLOOR_HEIGHT"
].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE) # ].fillna(FLOOR_HEIGHT_NATIONAL_AVERAGE)
return cleaning_averages_filled return cleaning_averages_filled
@ -402,12 +397,23 @@ class DataProcessor:
# Filter 4: We remove floor level in top floor or mid floor since this is ambiguous # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
# Filter 5: Remove any EPCs with a SAP score above 100
# Filter 6: We found a small number of cases that have missing window description so we drop these
# Filter 7: We found a small number of cases that have missing hotwater description so we drop these
self.data = self.data[~pd.isnull(self.data["UPRN"])] self.data = self.data[~pd.isnull(self.data["UPRN"])]
self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"] self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
self.data = self.data[ self.data = self.data[
~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"]) ~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])
] ]
self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
# We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them
self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
def clean_multi_glaze_proportion(self) -> None: def clean_multi_glaze_proportion(self) -> None:
""" """
@ -437,6 +443,12 @@ class DataProcessor:
differs depending on where the function is being used. differs depending on where the function is being used.
:return: Cleaned DataFrame. :return: Cleaned DataFrame.
""" """
cols_to_clean = [
c for c in ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT", "FIXED_LIGHTING_OUTLETS_COUNT"] if
c in data_to_clean.columns
]
# Enforce data types # Enforce data types
for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]: for col in ["NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]:
data_to_clean[col] = data_to_clean[col].astype(float) data_to_clean[col] = data_to_clean[col].astype(float)
@ -445,10 +457,9 @@ class DataProcessor:
columns_to_merge_on = data_to_clean[cols_to_merge_on].dropna().columns.tolist() columns_to_merge_on = data_to_clean[cols_to_merge_on].dropna().columns.tolist()
# Calculate averages # Calculate averages
cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg({ cleaning_averages_to_merge = cleaning_data.groupby(columns_to_merge_on).agg(
"TOTAL_FLOOR_AREA": "mean", dict(zip(cols_to_clean, ["mean", ] * len(cols_to_clean)))
"FLOOR_HEIGHT": "mean" )
})
# Merge with the original data # Merge with the original data
data_to_clean = pd.merge( data_to_clean = pd.merge(
@ -460,7 +471,7 @@ class DataProcessor:
) )
# Fill NaN values with averages # Fill NaN values with averages
for col in ["TOTAL_FLOOR_AREA", "FLOOR_HEIGHT"]: for col in cols_to_clean:
data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"], inplace=True) data_to_clean[col].fillna(data_to_clean[f"{col}_AVERAGE"], inplace=True)
data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True) data_to_clean.drop(columns=[f"{col}_AVERAGE"], inplace=True)

View file

@ -55,7 +55,8 @@ FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45
AVERAGE_FIXED_FEATURES = [ AVERAGE_FIXED_FEATURES = [
"TOTAL_FLOOR_AREA", "TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT" "FLOOR_HEIGHT",
"FIXED_LIGHTING_OUTLETS_COUNT",
] ]
COLUMNS_TO_MERGE_ON = [ COLUMNS_TO_MERGE_ON = [
@ -82,8 +83,7 @@ FIXED_FEATURES = [
"CONSTITUENCY", "CONSTITUENCY",
"NUMBER_HEATED_ROOMS", "NUMBER_HEATED_ROOMS",
"FIXED_LIGHTING_OUTLETS_COUNT", "FIXED_LIGHTING_OUTLETS_COUNT",
"FLOOR_HEIGHT", "FLOOR_HEIGHT"
"FLOOR_LEVEL",
"TOTAL_FLOOR_AREA", "TOTAL_FLOOR_AREA",
] ]
@ -120,7 +120,6 @@ LATEST_FIELD = [
"NUMBER_HABITABLE_ROOMS", "NUMBER_HABITABLE_ROOMS",
"NUMBER_HEATED_ROOMS", "NUMBER_HEATED_ROOMS",
"FIXED_LIGHTING_OUTLETS_COUNT", "FIXED_LIGHTING_OUTLETS_COUNT",
"FLOOR_LEVEL",
"CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for "CONSTRUCTION_AGE_BAND", # This is a field we're probably want to use verisk data for
] ]
@ -173,7 +172,7 @@ DATA_PROCESSOR_SETTINGS = {
COLUMNTYPES = { COLUMNTYPES = {
'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object', 'UPRN': 'object', 'TOTAL_FLOOR_AREA': 'float64', 'FLOOR_HEIGHT': 'float64', 'PROPERTY_TYPE': 'object',
'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64', 'BUILT_FORM': 'object', 'CONSTITUENCY': 'object', 'NUMBER_HABITABLE_ROOMS': 'float64',
'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64', 'FLOOR_LEVEL': 'float64', 'NUMBER_HEATED_ROOMS': 'float64', 'FIXED_LIGHTING_OUTLETS_COUNT': 'float64',
'CONSTRUCTION_AGE_BAND': 'object', 'CONSTRUCTION_AGE_BAND': 'object',
'TRANSACTION_TYPE': 'object', 'TRANSACTION_TYPE': 'object',
'WALLS_DESCRIPTION': 'object', 'WALLS_DESCRIPTION': 'object',
@ -194,3 +193,22 @@ COLUMNTYPES = {
'EXTENSION_COUNT': 'float64', 'EXTENSION_COUNT': 'float64',
'LODGEMENT_DATE': 'object', 'LODGEMENT_DATE': 'object',
} }
# For modelling, we don't allow records with more than 100 SAP points
MAX_SAP_SCORE = 100
fill_na_map = {
# There are some descriptions, such as "To be used only when there is no heating/hot-water system or data is from
# a community network" that could be clustered with unknown fuel
"MAIN_FUEL": "UNKNOWN",
"MECHANICAL_VENTILATION": "Unknown",
"SECONDHEAT_DESCRIPTION": "None",
"ENERGY_TARIFF": "Unknown",
# We set solar water heating flag to N - we could investigate using a different category entirely
"SOLAR_WATER_HEATING_FLAG": "N",
"GLAZED_TYPE": "not defined",
"MULTI_GLAZE_PROPORTION": 0,
"LOW_ENERGY_LIGHTING": 0,
"MAINHEATCONT_DESCRIPTION": "Unknown",
"EXTENSION_COUNT": 0,
}

View file

@ -14,7 +14,7 @@ from model_data.simulation_system.core.Settings import (
CARBON_RESPONSE, CARBON_RESPONSE,
) )
from model_data.simulation_system.core.DataProcessor import DataProcessor from model_data.simulation_system.core.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3, read_dataframe_from_s3_parquet
DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates" DATA_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates"
@ -92,7 +92,6 @@ def process_and_prune_desriptions(df, cleaned_lookup):
'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING', 'is_assumed_ENDING', 'has_dwelling_above_ENDING', 'is_valid_ENDING',
'insulation_thickness_ENDING', 'insulation_thickness_ENDING',
] ]
} }
for component in ["walls", "floor", "roof"]: for component in ["walls", "floor", "roof"]:
@ -172,22 +171,6 @@ def app():
dataset = [] dataset = []
cleaning_dataset = [] cleaning_dataset = []
# TODO [x] : Does energy tariff make a difference
# - leave for now but it may not
# TODO: [x] : Add starting SAP and head demand as a feature
# TODO [x] : If SAP hasn't changed, we don't include the record
# TODO [x]: If SAP gets worse, it genuinely looks like in the vast majority of cases that the building looks
# worse in the newer epc, so we can switch the orders
# TODO [x] : Have a look at temporal features
# TODO [x] : Floor area will impact the EPC so instead of averaging, we should have a starting and ending value.
# TODO [x]: Same as floor area for floor height
# TODO [x]: If fundamental building fabric changes, we should proabably discard the record
# TODO [x]: Should we prune records that have an exceptionally large amount of time between them?
# - leave for now and check performance after temporal features
# TODO [x]: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections?
# - Leave for now
#
for directory in tqdm(directories): for directory in tqdm(directories):
filepath = directory / "certificates.csv" filepath = directory / "certificates.csv"
@ -199,7 +182,6 @@ def app():
data_by_urpn = [] data_by_urpn = []
for uprn, property_data in df.groupby("UPRN", observed=True): for uprn, property_data in df.groupby("UPRN", observed=True):
# Fixed features - these are property attributes that shouldn't change over time # Fixed features - these are property attributes that shouldn't change over time
fixed_data = {} fixed_data = {}
@ -224,6 +206,13 @@ def app():
fixed_data.update(mandatory_field_data) fixed_data.update(mandatory_field_data)
fixed_data.update(latest_field_data) fixed_data.update(latest_field_data)
# Apply cleaning to fixed_data
fixed_data = DataProcessor.apply_averages_cleaning(
data_to_clean=pd.DataFrame([fixed_data]),
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
).to_dict("records")[0]
# We include the lodgement date here as we probably need to factor time into the # We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time # model, since EPC standards and rigour have changed over time
variable_data = modified_property_data[ variable_data = modified_property_data[
@ -289,6 +278,7 @@ def app():
data_by_urpn.extend(property_model_data) data_by_urpn.extend(property_model_data)
data_by_urpn_df = pd.DataFrame(data_by_urpn) data_by_urpn_df = pd.DataFrame(data_by_urpn)
# Add some temporal features - we look at the days from the standard starting point in time # Add some temporal features - we look at the days from the standard starting point in time
# for the starting and ending date so all records are from a fixed point # for the starting and ending date so all records are from a fixed point
data_by_urpn_df["DAYS_TO_STARTING"] = ( data_by_urpn_df["DAYS_TO_STARTING"] = (
@ -308,6 +298,9 @@ def app():
# is low # is low
# We also replace descriptions with their cleaned variants # We also replace descriptions with their cleaned variants
if pd.isnull(data_by_urpn_df).sum().sum():
raise ValueError("Null values found in dataset")
data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup) data_by_urpn_df = process_and_prune_desriptions(data_by_urpn_df, cleaned_lookup)
dataset.append(data_by_urpn_df) dataset.append(data_by_urpn_df)

View file

@ -1,6 +1,7 @@
import boto3 import boto3
from io import BytesIO from io import BytesIO
from botocore.exceptions import NoCredentialsError, PartialCredentialsError from botocore.exceptions import NoCredentialsError, PartialCredentialsError
import pandas as pd
def read_from_s3(bucket_name, s3_file_name): def read_from_s3(bucket_name, s3_file_name):
@ -63,3 +64,25 @@ def save_dataframe_to_s3_parquet(df, bucket_name, file_key):
# Upload the Parquet file to S3 # Upload the Parquet file to S3
client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue()) client.put_object(Bucket=bucket_name, Key=file_key, Body=parquet_buffer.getvalue())
def read_dataframe_from_s3_parquet(bucket_name, file_key):
"""
Read a pandas DataFrame from a Parquet file stored in S3.
:param bucket_name: Name of the S3 bucket.
:param file_key: Key of the file (including directory path within the bucket).
:return: A pandas DataFrame.
"""
# Create the boto3 client
client = boto3.client('s3')
# Get the Parquet file from S3
response = client.get_object(Bucket=bucket_name, Key=file_key)
# Read the file into a pandas DataFrame
parquet_buffer = BytesIO(response['Body'].read())
df = pd.read_parquet(parquet_buffer)
return df