Refactored Path, organised preprocessing steps for df

2026-08-02 21:08:24 +00:00 · 2023-08-09 12:42:57 +00:00 · 2023-08-09 12:42:57 +00:00 · bac9c2e6ae
commit bac9c2e6ae
parent 07af58eb85
1 changed files with 81 additions and 38 deletions
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@ -3,13 +3,21 @@ import os
 import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
-
+# from BaseUtility import BaseUtility  # alternative import, needed when running from a different folder
+from pathlib import Path

 def list_subdirectories(directory_path):
-    return [d for d in os.listdir(directory_path) if os.path.isdir(os.path.join(directory_path, d))]
+    # Returns the child entries themselves (as yielded by iterdir()), not bare names,
+    # so directory_path must be a path object such as pathlib.Path rather than a str.
+    return [entry for entry in directory_path.iterdir() if entry.is_dir()]

+# Location of the certificate data, resolved relative to this file rather than the
+# current working directory.
+DATA_DIRECTORY = Path(__file__).parent / 'data' / 'all-domestic-certificates'

-DATA_DIRECTORY = os.getcwd() + '/model_data/simulation_system/data/all-domestic-certificates'
+# WINDOWS_DESCRIPTION values that clean_multi_glaze_proportion treats as fully glazed.
+FULLY_GLAZED_DESCRIPTIONS = [
+    "Fully double glazed",
+    "High performance glazing",
+    "Fully triple glazed",
+    "Full secondary glazing",
+    "Multiple glazing throughout",
+]

 FIXED_FEATURES = [
    'PROPERTY_TYPE',
@ -122,20 +130,13 @@ def iterative_filtering(cleaning_averages, property_data):
    return filtered_data


-def clean_multi_glaze_proportion(df):
-    fully_glazed_descriptions = [
-        "Fully double glazed",
-        "High performance glazing",
-        "Fully triple glazed",
-        "Full secondary glazing",
-        "Multiple glazing throughout",
-    ]
+def clean_multi_glaze_proportion(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
+
+    Rows are selected when MULTI_GLAZE_PROPORTION is null and WINDOWS_DESCRIPTION is one of
+    FULLY_GLAZED_DESCRIPTIONS. The dataframe is updated in place and also returned.
+    """

-    df["MULTI_GLAZE_PROPORTION"] = np.where(
-        pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(fully_glazed_descriptions)),
-        100,
-        df["MULTI_GLAZE_PROPORTION"],
-    )
+    missing_but_fully_glazed = (
+        df["MULTI_GLAZE_PROPORTION"].isna()
+        & df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS)
+    )
+    # Assign through .loc only: chaining this onto `df = ...` would rebind df to the
+    # integer 100 before the .loc lookup is evaluated.
+    df.loc[missing_but_fully_glazed, "MULTI_GLAZE_PROPORTION"] = 100

    return df

@ -167,6 +168,59 @@ BUILT_FORM_REMAP = {
 }


+def confine_data(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Apply all the steps that reduce the data down based on our assumptions.
+
+    A row is kept only when it passes every filter below. The filters are combined into a
+    single boolean mask so that each one is evaluated against, and aligned with, the same
+    dataframe, and the data is only sliced once.
+
+    :param df: EPC records with UPRN, LODGEMENT_DATE, TRANSACTION_TYPE and FLOOR_LEVEL columns
+    :return: the rows of df that pass all four filters
+    """
+
+    # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
+    has_uprn = df["UPRN"].notna()
+
+    # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
+    # before the introduction of SAP09
+    lodged_since_earliest_date = df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE
+
+    # Filter 3: We remove EPCs that were conducted for a new build, since these are performed with
+    # full SAP, which produces different results to the RdSAP methodology
+    not_new_build = df["TRANSACTION_TYPE"] != "new dwelling"
+
+    # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
+    unambiguous_floor_level = ~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])
+
+    return df[has_uprn & lodged_since_earliest_date & not_new_build & unambiguous_floor_level]
+
+def retain_multiple_epc_properties(df: pd.DataFrame, minimum_count: int = 1) -> pd.DataFrame:
+    """
+    Reduce the data further by keeping only properties with multiple EPCs.
+
+    A UPRN is kept when it appears in strictly more than `minimum_count` rows. The number
+    of rows per UPRN is joined back onto the result as a "count" column.
+    """
+
+    epcs_per_uprn = df.groupby("UPRN").size().reset_index(name="count")
+
+    # take UPRNS with multiple EPCs
+    repeated_uprns = epcs_per_uprn[epcs_per_uprn["count"] > minimum_count]
+
+    return df.merge(repeated_uprns, on='UPRN')
+
+def recast_df_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
+    """
+    Recast columns from the dataframe to ensure the behaviour we want
+
+    :param df: dataframe whose columns are recast; it is modified in place and returned
+    :param column_mappings: maps a column name to the sequence of types to cast it through,
+        in order (e.g. {'UPRN': [int, str]} casts to int first and then to str)
+    :return: df with the listed columns recast
+    :raises SystemExit: with exit code 1 if a key of column_mappings is not a column of df
+    """
+
+    for key, values in column_mappings.items():
+        if key not in df.columns:
+            print('Column mapping incorrectly specified')
+            # Raise SystemExit directly: the exit() builtin is added by the site module for
+            # interactive use and is not guaranteed to exist in every interpreter.
+            raise SystemExit(1)
+        for value in values:
+            df[key] = df[key].astype(value)
+
+    return df
+
+
+
 def app():
    # Get all the files in the directory

@ -177,35 +231,20 @@ def app():

    dataset = []
    for directory in tqdm(directories):
-        filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
+
+        filepath = directory / "certificates.csv"
        df = pd.read_csv(filepath, low_memory=False)
-        # UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
-        df = df[~pd.isnull(df["UPRN"])]
-        # Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
-        # before the introduction of SAP09
-        df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]

-        cleaning_averages, general_averages = make_cleaning_averages(df)
-
-        # We remove EPCS that were conducted for a new build, since these are performed with
-        # full SAP, which produces different results to the RdSAP methodology
-        df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
+        df = confine_data(df)
+        df = recast_df_columns(df, {'UPRN': [int, str]})

        df = clean_multi_glaze_proportion(df)
+        df = retain_multiple_epc_properties(df, minimum_count=1)

-        # We remove floor level in top floor or mid floor since this is ambiguous
-        df = df[~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
-
-        df["UPRN"] = df["UPRN"].astype(int).astype(str)
-        counts = df.groupby("UPRN").size().reset_index()
-        counts.columns = ["UPRN", "count"]
-        counts = counts.sort_values("count", ascending=False)
-
-        # take UPRNS with multiple EPCs
-        counts = counts[counts["count"] > 1]
-        df = df[df["UPRN"].isin(counts["UPRN"])]
        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)

+        cleaning_averages, general_averages = make_cleaning_averages(df)
+
        for uprn, property_data in df.groupby("UPRN", observed=True):

            # Fixed features - these are property attributes that shouldn't change over time
@ -305,3 +344,7 @@ def app():
                )

            dataset.extend(property_model_data)
+
+
+if __name__ == "__main__":
+    app()