mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
142 lines
7.3 KiB
Python
142 lines
7.3 KiB
Python
from pathlib import Path
|
|
import pandas as pd
|
|
from settings import (
|
|
DATA_PROCESSOR_SETTINGS,
|
|
EARLIEST_EPC_DATE,
|
|
FULLY_GLAZED_DESCRIPTIONS,
|
|
AVERAGE_FIXED_FEATURES,
|
|
FLOOR_HEIGHT_NATIONAL_AVERAGE,
|
|
TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
|
|
)
|
|
|
|
|
|
class DataProcessor:
    """
    Handle data loading and data preprocessing.

    The processor reads a CSV file into ``self.data`` and then applies a
    sequence of in-place cleaning steps to that frame. ``self.data`` only
    exists once ``load_data`` (or ``pre_process``) has been called.
    """

    def __init__(self, filepath: Path) -> None:
        """
        Store the location of the CSV file; nothing is read until
        ``load_data`` is called.
        """
        self.filepath = filepath

    def load_data(self, low_memory: bool = False) -> None:
        """
        Read the CSV at ``self.filepath`` into ``self.data``.

        :param low_memory: forwarded unchanged to ``pandas.read_csv``
        """
        self.data = pd.read_csv(self.filepath, low_memory=low_memory)

    def pre_process(self) -> pd.DataFrame:
        """
        Load data and begin initial cleaning.

        Runs, in order: load, row filtering, column recasting, multi-glaze
        clean-up and the per-UPRN EPC count filter, each configured from
        DATA_PROCESSOR_SETTINGS. The result is sorted by UPRN and then by
        lodgement date (both ascending), stored on ``self.data`` and returned.
        """
        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
        self.confine_data()
        self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
        self.clean_multi_glaze_proportion()
        self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])

        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

        return self.data

    def make_cleaning_averages(self) -> pd.DataFrame:
        """
        Build a lookup table of median feature values per property group.

        Medians of AVERAGE_FIXED_FEATURES (missing values excluded) are taken
        for every observed combination of property type, built form,
        construction age band, habitable rooms and heated rooms.
        AVERAGE_FIXED_FEATURES must contain TOTAL_FLOOR_AREA and FLOOR_HEIGHT;
        gaps in those two columns are then filled from progressively coarser
        sources:

        1. median for the same property type and built form
        2. median for the same property type
        3. median for the same built form
        4. mean of the values already present in this table
        5. the national average constants

        ``self.data`` is not modified.

        :return: one row per observed group, with TOTAL_FLOOR_AREA and
            FLOOR_HEIGHT free of missing values
        """

        def group_medians(keys: list) -> pd.DataFrame:
            # The built-in group-by median skips missing values, so no custom
            # per-group Python function is needed.
            grouped = self.data.groupby(keys, observed=True)
            return grouped[AVERAGE_FIXED_FEATURES].median().reset_index()

        cleaning_averages = group_medians(
            ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]
        )

        # Coarser groupings, listed in the order they are used as fallbacks.
        # Each entry is (join keys, suffix given to the fallback columns).
        fallback_levels = (
            (["PROPERTY_TYPE", "BUILT_FORM"], "_AVERAGE"),
            (["PROPERTY_TYPE"], "_PROPERTY_AVERAGE"),
            (["BUILT_FORM"], "_BUILT_FORM_AVERAGE"),
        )

        # Attach every fallback level to the detailed table. The detailed
        # columns keep their names; the fallback columns take the suffix.
        cleaning_averages_filled = cleaning_averages
        for keys, suffix in fallback_levels:
            cleaning_averages_filled = pd.merge(
                cleaning_averages_filled,
                group_medians(keys),
                on=keys,
                suffixes=("", suffix),
            )

        filled_features = ("TOTAL_FLOOR_AREA", "FLOOR_HEIGHT")

        # Fill remaining gaps level by level, discarding the helper columns of
        # a level once it has been used.
        for _, suffix in fallback_levels:
            helper_columns = [feature + suffix for feature in filled_features]
            for feature, helper in zip(filled_features, helper_columns):
                cleaning_averages_filled[feature] = cleaning_averages_filled[feature].fillna(
                    cleaning_averages_filled[helper]
                )
            cleaning_averages_filled = cleaning_averages_filled.drop(columns=helper_columns)

        # If there are still missing values, use the mean of the values that
        # are present in the table.
        for feature in filled_features:
            cleaning_averages_filled[feature] = cleaning_averages_filled[feature].fillna(
                cleaning_averages_filled[feature].mean()
            )

        # If a whole column was missing (its mean is NaN too), fall back to
        # the national average values.
        cleaning_averages_filled['TOTAL_FLOOR_AREA'] = cleaning_averages_filled['TOTAL_FLOOR_AREA'].fillna(
            TOTAL_FLOOR_AREA_NATIONAL_AVERAGE
        )
        cleaning_averages_filled['FLOOR_HEIGHT'] = cleaning_averages_filled['FLOOR_HEIGHT'].fillna(
            FLOOR_HEIGHT_NATIONAL_AVERAGE
        )

        return cleaning_averages_filled

    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
        """
        Reduce the data further by keeping only properties with multiple EPCs.

        A UPRN is kept when its number of rows is strictly greater than
        ``epc_minimum_count``, so the default of 1 keeps properties with two
        or more EPCs. As a side effect ``self.data`` gains a ``count`` column
        holding the number of rows for that UPRN.

        :param epc_minimum_count: exclusive lower bound on rows per UPRN
        """
        counts = self.data.groupby("UPRN").size().reset_index(name="count")

        # Take UPRNs with multiple EPCs; the inner join drops every other row
        counts = counts[counts["count"] > epc_minimum_count]
        self.data = pd.merge(self.data, counts, on='UPRN')

    def recast_df_columns(self, column_mappings: dict) -> None:
        """
        Recast columns from the dataframe to ensure the behaviour we want.

        :param column_mappings: maps a column name to an iterable of dtypes;
            the casts are applied to that column one after another, in order
        :raises SystemExit: with exit code 1, after printing a message, when
            a mapped column is not present in the data
        """
        for key, values in column_mappings.items():
            if key not in self.data.columns:
                print('Column mapping incorrectly specified')
                # Raise directly instead of calling exit(): that helper is
                # injected by the site module for interactive use and is not
                # guaranteed to exist in every interpreter.
                raise SystemExit(1)
            for value in values:
                self.data[key] = self.data[key].astype(value)

    def confine_data(self) -> None:
        """
        Include all steps to reduce down the data based on assumptions.

        The filters are applied one after another and the result replaces
        ``self.data``.
        """
        frame = self.data

        # Filter 1: UPRN is a unique identifier for a property, so we remove
        # any EPCs that don't have one
        frame = frame[frame["UPRN"].notna()]

        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove
        # any EPCs that were lodged before the introduction of SAP09
        # NOTE(review): this runs before recast_df_columns in pre_process, so
        # the comparison relies on EARLIEST_EPC_DATE being comparable with the
        # raw column values - confirm against settings
        frame = frame[frame["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]

        # Filter 3: We remove EPCs that were conducted for a new build, since
        # these are performed with full SAP, which produces different results
        # to the RdSAP methodology
        frame = frame[frame["TRANSACTION_TYPE"] != "new dwelling"]

        # Filter 4: We remove floor level in top floor or mid floor since this
        # is ambiguous
        frame = frame[~frame["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]

        self.data = frame

    def clean_multi_glaze_proportion(self) -> None:
        """
        If there is no multi-glaze proportion but the windows are fully glazed,
        then we should assume a score of 100.

        "Fully glazed" means the window description is one of
        FULLY_GLAZED_DESCRIPTIONS. ``self.data`` is updated in place.
        """
        proportion_missing = self.data["MULTI_GLAZE_PROPORTION"].isna()
        fully_glazed = self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
        self.data.loc[proportion_missing & fully_glazed, 'MULTI_GLAZE_PROPORTION'] = 100
|
|
|