From 387a19d7cfd76c1e8033c1977986748488adb2d8 Mon Sep 17 00:00:00 2001
From: Michael Duong <michael123ster@gmail.com>
Date: Thu, 10 Aug 2023 20:10:50 +0000
Subject: [PATCH] added a dataprocessor class

---
 model_data/simulation_system/app.py | 187 ++++++++++++++++------------
 1 file changed, 105 insertions(+), 82 deletions(-)

diff --git a/model_data/simulation_system/app.py b/model_data/simulation_system/app.py
index 3ab300f7..62f5d2ff 100644
--- a/model_data/simulation_system/app.py
+++ b/model_data/simulation_system/app.py
@@ -3,8 +3,8 @@ import os
 import pandas as pd
 from tqdm import tqdm
 from model_data.BaseUtility import BaseUtility
-# from BaseUtility import BaseUtility # I need this import as working in different folder
 from pathlib import Path
+from typing import Tuple
 
 def list_subdirectories(directory_path):
     return [entry for entry in directory_path.iterdir() if entry.is_dir()]
@@ -91,56 +91,34 @@ RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
 
 
-def make_cleaning_averages(df):
-    # Define a custom function to calculate the median, excluding missing values
-    def median_without_missing(group):
-        return group[AVERAGE_FIXED_FEATURES].dropna().median()
-
-    cleaning_averages = df.groupby(
-        ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
-        observed=True
-    ).apply(median_without_missing).reset_index()
-
-    general_averages = df.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
-        median_without_missing).reset_index()
-
-    return cleaning_averages, general_averages
-
-
 def iterative_filtering(cleaning_averages, property_data):
+
+
     # Define the columns to filter on
     columns_to_filter = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS",
                          "NUMBER_HEATED_ROOMS"]
+    
+    # Merge datasets together on columns
+    filtered_data = pd.merge(cleaning_averages, property_data.iloc[[-1]], on=columns_to_filter)
 
-    # Start with the entire cleaning_averages DataFrame
-    filtered_data = cleaning_averages.copy()
+    # # Start with the entire cleaning_averages DataFrame
+    # filtered_data = cleaning_averages.copy()
 
-    # Iterate through the columns and apply filters one by one
-    for column in columns_to_filter:
-        # Apply the filter using the value from property_data
-        new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
+    # # Iterate through the columns and apply filters one by one
+    # for column in columns_to_filter:
+    #     # Apply the filter using the value from property_data
+    #     new_filtered_data = filtered_data[filtered_data[column] == property_data[column].iloc[0]]
 
-        # If the filter results in no data, return the previous result
-        if new_filtered_data.empty:
-            continue
+    #     # If the filter results in no data, return the previous result
+    #     if new_filtered_data.empty:
+    #         continue
 
-        # If the filter is successful, update the filtered data
-        filtered_data = new_filtered_data
+    #     # If the filter is successful, update the filtered data
+    #     filtered_data = new_filtered_data
 
     return filtered_data
 
 
-def clean_multi_glaze_proportion(df: pd.DataFrame) -> pd.DataFrame:
-    """
-    If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
-    """
-
-    no_multi_glaze_proportion_index = pd.isnull(df["MULTI_GLAZE_PROPORTION"]) & (df["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
-    df = df.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
-
-    return df
-
-
 def ordinal(n):
     if 10 <= n % 100 <= 20:
         suffix = 'th'
@@ -167,57 +145,107 @@ BUILT_FORM_REMAP = {
     "Enclosed Mid-Terrace": "Mid-Terrace",
 }
 
+DATA_PROCESSOR_SETTINGS = {
+    'low_memory': False,
+    'epc_minimum_count': 1,
+    'column_mappings': {'UPRN': [int, str]}
+}
 
-def confine_data(df: pd.DataFrame) -> pd.DataFrame:
+class DataProcessor:
     """
-    Include all step to reduce down the data based on assumptions
+    Handle data loading and data preprocessing
     """
 
-    # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
+    def __init__(self, filepath: Path) -> None:
+        self.filepath = filepath
 
-    # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
-    # before the introduction of SAP09
+    def load_data(self, low_memory=False) -> None:
+        self.data = pd.read_csv(self.filepath, low_memory=low_memory)
 
-    # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
-    # full SAP, which produces different results to the RdSAP methodology
+    def process(self) -> pd.DataFrame:
+        """
+        Load all data and process data via composition
+        """
+        self.load_data(low_memory=DATA_PROCESSOR_SETTINGS['low_memory'])
+        self.confine_data()
+        self.recast_df_columns(column_mappings=DATA_PROCESSOR_SETTINGS['column_mappings'])
+        self.clean_multi_glaze_proportion()
+        self.retain_multiple_epc_properties(epc_minimum_count=DATA_PROCESSOR_SETTINGS['epc_minimum_count'])
+        
+        self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
 
-    # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
+        return self.data
+    
+    def make_cleaning_averages(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
+        # Define a custom function to calculate the median, excluding missing values
+        def median_without_missing(group):
+            return group[AVERAGE_FIXED_FEATURES].median(skipna=True)
+
+        cleaning_averages = self.data.groupby(
+            ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"],
+            observed=True
+        ).apply(median_without_missing).reset_index()
+
+        general_averages = self.data.groupby(["PROPERTY_TYPE", "BUILT_FORM"], observed=True).apply(
+            median_without_missing).reset_index()
+
+        return cleaning_averages, general_averages
+
+    def retain_multiple_epc_properties(self, epc_minimum_count: int = 1) -> None:
+        '''
+        Reduce the data further by keeping only datasets with multiple EPCs
+        '''
+
+        counts = self.data.groupby("UPRN").size().reset_index()
+        counts.columns = ["UPRN", "count"]
+
+        # take UPRNS with multiple EPCs
+        counts = counts[counts["count"] > epc_minimum_count]
+        self.data = pd.merge(self.data, counts, on='UPRN')
+
+    
+    def recast_df_columns(self, column_mappings: dict) -> None:
+        """
+        Recast columns from the dataframe to ensure the behaviour we want
+        """
+
+        for key, values in column_mappings.items():
+            if key not in self.data.columns:
+                print('Column mapping incorrectly specified')
+                exit(1)
+            for value in values:
+                self.data[key] = self.data[key].astype(value)
 
 
-    df = df[~pd.isnull(df["UPRN"])] \
-        [df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] \
-        [df["TRANSACTION_TYPE"] != "new dwelling"] \
-        [~df["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
+    def confine_data(self) -> None:
+        """
+        Include all steps to reduce the data based on assumptions
+        """
 
-    return df
+        # Filter 1: UPRN is a unique identifier for a property, so we remove any EPCs that don't have one
 
-def retain_multiple_epc_properties(df: pd.DataFrame, minimum_count: int = 1) -> pd.DataFrame:
-    '''
-    Reduce the data futher by keeping only datasets with multiple epcs
-    '''
+        # Filter 2: Lodgement date is the date the EPC was lodged, so we remove any EPCs that were lodged
+        # before the introduction of SAP09
 
-    counts = df.groupby("UPRN").size().reset_index()
-    counts.columns = ["UPRN", "count"]
+        # Filter 3: We remove EPCS that were conducted for a new build, since these are performed with
+        # full SAP, which produces different results to the RdSAP methodology
 
-    # take UPRNS with multiple EPCs
-    counts = counts[counts["count"] > minimum_count]
-    df = pd.merge(df, counts, on='UPRN')
+        # Filter 4: We remove floor level in top floor or mid floor since this is ambiguous
 
-    return df
 
-def recast_df_columns(df: pd.DataFrame, column_mappings: dict) -> pd.DataFrame:
-    """
-    Recast columns from the dataframe to ensure the behaviour we want
-    """
+        self.data = self.data[~pd.isnull(self.data["UPRN"])] \
+            [self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE] \
+            [self.data["TRANSACTION_TYPE"] != "new dwelling"] \
+            [~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])]
 
-    for key, values in column_mappings.items():
-        if key not in df.columns:
-            print('Column mapping incorrectly specified')
-            exit(1)
-        for value in values:
-            df[key] = df[key].astype(value)
+    
+    def clean_multi_glaze_proportion(self) -> None:
+        """
+        If there is no multi-glaze proportion but the windows are fully glazed, then we should assume a score of 100
+        """
 
-    return df
+        no_multi_glaze_proportion_index = pd.isnull(self.data["MULTI_GLAZE_PROPORTION"]) & (self.data["WINDOWS_DESCRIPTION"].isin(FULLY_GLAZED_DESCRIPTIONS))
+        self.data.loc[no_multi_glaze_proportion_index, 'MULTI_GLAZE_PROPORTION'] = 100
 
 
 
@@ -233,22 +261,17 @@ def app():
     for directory in tqdm(directories):
 
         filepath = directory / "certificates.csv"
-        df = pd.read_csv(filepath, low_memory=False)
 
-        df = confine_data(df)
-        df = recast_df_columns(df, {'UPRN': [int, str]})
+        data_processor = DataProcessor(filepath=filepath)
 
-        df = clean_multi_glaze_proportion(df)
-        df = retain_multiple_epc_properties(df, minimum_count=1)
-
-        df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
-
-        cleaning_averages, general_averages = make_cleaning_averages(df)
+        df = data_processor.process()
+        cleaning_averages, general_averages = data_processor.make_cleaning_averages()
 
         for uprn, property_data in df.groupby("UPRN", observed=True):
 
             # Fixed features - these are property attributes that shouldn't change over time
 
+            
             ignore_epc = False
             fixed_data = {}
             for field in FIXED_FEATURES: