diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py index 545fae16..3ab300f7 100644 --- a/model_data/simulation_system/app.py +++ b/model_data/simulation_system/app.py @@ -3,13 +3,21 @@ import os import pandas as pd from tqdm import tqdm from model_data.BaseUtility import BaseUtility - +# from BaseUtility import BaseUtility # I need this import as working in different folder +from pathlib import Path def list_subdirectories(directory_path): - return [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))] + return [entry for entry in Path(directory_path).iterdir() if entry.is_dir()] +DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates' -DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates' +FULLY_GLAZED_DESCRIPTIONS = [ + "Fully double glazed", + "High performance glazing", + "Fully triple glazed", + "Full secondary glazing", + "Multiple glazing throughout", +] FIXED_FEATURES = [ 'PROPERTY_TYPE', @@ -122,20 +130,13 @@ def iterative_filtering(cleaning_averages, property_data): return filtered_data -def clean_multi_glaze_proportion(df): - fully_glazed_descriptions = [ - "Fully double glazed", - "High performance glazing", - "Fully triple glazed", - "Full secondary glazing", - "Multiple glazing throughout", - ] +def clean_multi_glaze_proportion(df: pd.DataFrame) -> pd.DataFrame: + """ + Set MULTI_GLAZE_PROPORTION to 100 (in place) where it is null but WINDOWS_DESCRIPTION is one of FULLY_GLAZED_DESCRIPTIONS; returns the same frame + """ - df["MULTI_GLAZE_PROPORTION"] = np.where( - pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)), - 100, - df["MULTI_GLAZE_PROPORTION"], - ) + no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)) + df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100 return df @@ -167,6 +168,59 @@ 
BUILT_FORM_REMAP = { } +def confine_data(df: pd.DataFrame) -> pd.DataFrame: + """ + Apply all the row filters that reduce the data down, based on the assumptions listed below + """ + + # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one + + # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged + # before the introduction of SAP09 + + # Filter 3: We remove EPCs that were conducted for a new build, since these are performed with + # full SAP, which produces different results to the RdSAP methodology + + # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous + + + df = df[~pd.isnull(df["UPRN"]) + & (df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE) + & (df["TRANSACTION_TYPE"] != "new dwelling") + & ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] + + return df + +def retain_multiple_epc_properties(df: pd.DataFrame, minimum_count: int = 1) -> pd.DataFrame: + """ + Reduce the data further by keeping only rows whose UPRN appears more than minimum_count times + """ + + counts = df.groupby("UPRN").size().reset_index() + counts.columns = ["UPRN", "count"] + + # take UPRNS with multiple EPCs + counts = counts[counts["count"] > minimum_count] + df = df[df["UPRN"].isin(counts["UPRN"])] + + return df + +def recast_df_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame: + """ + Cast each column named in column_mappings through its listed dtypes, in order; exits with status 1 on an unknown column + """ + + for key, values in column_mappings.items(): + if key not in df.columns: + print('Column mapping incorrectly specified') + raise SystemExit(1) + for value in values: + df[key] = df[key].astype(value) + + return df + + + + def app(): # Get all the files in the directory @@ -177,35 +231,20 @@ def app(): dataset = [] for directory in tqdm(directories): - filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv") + + filepath = directory / "certificates.csv" df = pd.read_csv(filepath, low_memory=False) - # UPRN is a unique identifier for a property, so we remove any 
EPCs that don't have one - df = df[~pd.isnull(df["UPRN"])] - # Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged - # before the introduction of SAP09 - df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] - cleaning_averages, general_averages = make_cleaning_averages(df) - - # We remove EPCS that were conducted for a new build, since these are performed with - # full SAP, which produces different results to the RdSAP methodology - df = df[df["TRANSACTION_TYPE"] != "new dwelling"] + df = confine_data(df) + df = recast_df_columns(df, {'UPRN': [int, str]}) df = clean_multi_glaze_proportion(df) + df = retain_multiple_epc_properties(df, minimum_count=1) - # We remove floor level in top floor or mid floor since this is ambiguous - df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])] - - df["UPRN"] = df["UPRN"].astype(int).astype(str) - counts = df.groupby("UPRN").size().reset_index() - counts.columns = ["UPRN", "count"] - counts = counts.sort_values("count", ascending=False) - - # take UPRNS with multiple EPCs - counts = counts[counts["count"] > 1] - df = df[df["UPRN"].isin(counts["UPRN"])] df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) + cleaning_averages, general_averages = make_cleaning_averages(df) + for uprn, property_data in df.groupby("UPRN", observed=True): # Fixed features - these are property attributes that shouldn't change over time @@ -305,3 +344,7 @@ def app(): ) dataset.extend(property_model_data) + + +if __name__ == "__main__": + app() \ No newline at end of file