Refactored Path, organised preprocessing steps for df

This commit is contained in:
Michael Duong 2023-08-09 12:42:57 +00:00
parent 07af58eb85
commit bac9c2e6ae

View file

@ -3,13 +3,21 @@ import os
import pandas as pd
from tqdm import tqdm
from model_data.BaseUtility import BaseUtility
# from BaseUtility import BaseUtility # I need this import as working in different folder
from pathlib import Path
def list_subdirectories(directory_path):
    """
    Return the immediate sub-directories of ``directory_path``.

    ``directory_path`` may be a ``str`` or a ``pathlib.Path``. Every entry is
    returned as a ``Path`` that already includes ``directory_path``, so a caller
    can build a file path with ``entry / "certificates.csv"``. Plain files are
    skipped. The order is whatever the file system yields (not sorted).
    """
    # Coerce through Path() so string callers keep working alongside Path callers.
    return [entry for entry in Path(directory_path).iterdir() if entry.is_dir()]
# Resolve the data folder relative to this file rather than the current working
# directory, so the script behaves the same wherever it is launched from.
# It is a single Path assignment: a later string re-assignment would silently
# override it and break the pathlib-based callers.
DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

# WINDOWS_DESCRIPTION values that are treated as "fully glazed" when a missing
# MULTI_GLAZE_PROPORTION is back-filled.
FULLY_GLAZED_DESCRIPTIONS = [
    "Fully double glazed",
    "High performance glazing",
    "Fully triple glazed",
    "Full secondary glazing",
    "Multiple glazing throughout",
]
FIXED_FEATURES = [
'PROPERTY_TYPE',
@ -122,20 +130,13 @@ def iterative_filtering(cleaning_averages, property_data):
return filtered_data
def clean_multi_glaze_proportion(df: pd.DataFrame) -> pd.DataFrame:
    """
    If there is no multi-glaze proportion but the windows are fully glazed, then we should
    assume a score of 100.

    Rows qualify when MULTI_GLAZE_PROPORTION is null and WINDOWS_DESCRIPTION is one of
    FULLY_GLAZED_DESCRIPTIONS. The column is updated on the passed-in frame, which is
    also returned so the call can be chained with the other preprocessing steps.
    """
    missing_but_fully_glazed = (
        df["MULTI_GLAZE_PROPORTION"].isna()
        & df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
    )
    # Must be a plain .loc assignment: the chained form `df = df.loc[...] = 100`
    # rebinds df to the integer 100 first and then fails on `100.loc`.
    df.loc[missing_but_fully_glazed, "MULTI_GLAZE_PROPORTION"] = 100
    return df
@ -167,6 +168,59 @@ BUILT_FORM_REMAP = {
}
def confine_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Apply every row filter that narrows the data down based on our assumptions.

    Returns the rows of ``df`` that pass all four filters below; the input frame
    is not modified.
    """
    # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
    has_uprn = df["UPRN"].notna()
    # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
    # before the introduction of SAP09
    lodged_recently = df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE
    # Filter 3: We remove EPCs that were conducted for a new build, since these are performed with
    # full SAP, which produces different results to the RdSAP methodology
    not_new_build = df["TRANSACTION_TYPE"] != "new dwelling"
    # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
    unambiguous_floor = ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])

    # Combine the masks and index once. Chaining df[m1][m2]... applies masks built
    # from the unfiltered frame to an already-filtered one, which makes pandas
    # re-align every mask (with a warning) and breaks on a non-unique index.
    return df[has_uprn & lodged_recently & not_new_build & unambiguous_floor]
def retain_multiple_epc_properties(df: pd.DataFrame, minimum_count: int = 1) -> pd.DataFrame:
    """
    Keep only the rows whose UPRN occurs more than ``minimum_count`` times.

    The comparison is strict, so the default of 1 keeps properties with at least
    two EPCs. The result comes from an inner merge on UPRN, so it carries an
    extra ``count`` column holding the number of rows for that UPRN.
    """
    epcs_per_uprn = df.groupby("UPRN").size().rename("count").reset_index()
    # take UPRNs with multiple EPCs
    repeated = epcs_per_uprn.loc[epcs_per_uprn["count"] > minimum_count]
    return df.merge(repeated, on="UPRN")
def recast_df_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
    """
    Recast columns of the dataframe to ensure the behaviour we want.

    ``column_mappings`` maps a column name to a sequence of dtypes that are
    applied one after another with ``astype`` (e.g. ``{'UPRN': [int, str]}``
    casts to int first, then to str). The columns are replaced on the passed-in
    frame, which is also returned.

    Raises:
        KeyError: if any key of ``column_mappings`` is not a column of ``df``.
            All keys are checked before any column is touched.
    """
    # Raise instead of print + exit(1): a helper should not terminate the
    # interpreter, and the error should say which column was wrong.
    missing = [column for column in column_mappings if column not in df.columns]
    if missing:
        raise KeyError(f"Column mapping incorrectly specified, not in dataframe: {missing}")

    for column, dtypes in column_mappings.items():
        for dtype in dtypes:
            df[column] = df[column].astype(dtype)
    return df
def app():
# Get all the files in the directory
@ -177,35 +231,20 @@ def app():
dataset = []
for directory in tqdm(directories):
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
filepath = directory / "certificates.csv"
df = pd.read_csv(filepath, low_memory=False)
# UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
df = df[~pd.isnull(df["UPRN"])]
# Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
# before the introduction of SAP09
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
cleaning_averages, general_averages = make_cleaning_averages(df)
# We remove EPCS that were conducted for a new build, since these are performed with
# full SAP, which produces different results to the RdSAP methodology
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
df = confine_data(df)
df = recast_df_columns(df, {'UPRN': [int, str]})
df = clean_multi_glaze_proportion(df)
df = retain_multiple_epc_properties(df, minimum_count=1)
# We remove floor level in top floor or mid floor since this is ambiguous
df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
df["UPRN"] = df["UPRN"].astype(int).astype(str)
counts = df.groupby("UPRN").size().reset_index()
counts.columns = ["UPRN", "count"]
counts = counts.sort_values("count", ascending=False)
# take UPRNS with multiple EPCs
counts = counts[counts["count"] > 1]
df = df[df["UPRN"].isin(counts["UPRN"])]
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
cleaning_averages, general_averages = make_cleaning_averages(df)
for uprn, property_data in df.groupby("UPRN", observed=True):
# Fixed features - these are property attributes that shouldn't change over time
@ -305,3 +344,7 @@ def app():
)
dataset.extend(property_model_data)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app()