From 4102d23063072e2f5eeba2bb58e680f48c765695 Mon Sep 17 00:00:00 2001
From: Michael Duong <michaelduong@Michaels-MacBook-Pro.local>
Date: Sat, 2 Dec 2023 11:24:01 +0000
Subject: [PATCH] Move averages cleaning into DataProcessor.pre_process

---
 etl/epc/DataProcessor.py       | 39 +++++++++++++++++++++++-----------
 etl/epc/property_change_app.py | 35 ++++++++++++++++++------------
 2 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index 8adac8df..8dbdbeb9 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -67,16 +67,17 @@ class DataProcessor:
     Handle data loading and data preprocessing
     """
 
-    def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
+    def __init__(self, filepath: Path | None, is_newdata: bool = False) -> None:
         """
         :param filepath: If specified, is the physical location of the data
-        :param newdata: Indicates if we are processing new, testing data.
+        :param is_newdata: Indicates if we are processing new, testing data.
                         In this instance, there are some operations we do not
                         want to perform, such as confine_data()
         """
         self.filepath = filepath
         self.data = None
-        self.newdata = newdata
+        self.cleaning_averages = None
+        self.is_newdata = is_newdata
 
     def load_data(self, low_memory=False) -> None:
         if not self.filepath:
@@ -130,6 +131,7 @@ class DataProcessor:
         TODO: We could use a model based impution approach for possibly more accurate cleaning
         """
 
+        # TODO: Do we want to move this out of this function (i.e. alter the data before we do any cleaning)?
         self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
 
         def apply_clean(data, matching_columns):
@@ -174,13 +176,13 @@ class DataProcessor:
         if self.data is None:
             self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
 
-        if not self.newdata:
+        if not self.is_newdata:
             self.confine_data()
 
         self.remap_columns()
 
         # We have some non-standard construction age bands which we'll clean for matching
-        if not self.newdata:
+        if not self.is_newdata:
             self.standardise_construction_age_band()
             self.clean_missing_rooms()
 
@@ -188,12 +190,12 @@ class DataProcessor:
             column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
         )
 
-        if not self.newdata:
+        if not self.is_newdata:
             self.clean_multi_glaze_proportion()
 
         self.clean_photo_supply()
 
-        if not self.newdata:
+        if not self.is_newdata:
             self.retain_multiple_epc_properties(
                 epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
             )
@@ -202,24 +204,37 @@ class DataProcessor:
             # If we have multiple EPC records, we can try and do filling
             self.fill_na_fields()
 
-        if not self.newdata:
+        if not self.is_newdata:
             self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
 
         # Final re-casting after data transformed and prepared
-        coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
+        coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES
         for k, v in coltypes.items():
             self.data[k] = self.data[k].astype(v)
         self.data = self.data.astype(coltypes)
 
         self.na_remapping()
 
-        return self.data
+        if not self.is_newdata:
+            # Some records have a missing CONSTITUENCY; fill with the most common value (mode)
+            self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
+
+            self.cleaning_averages = self.make_cleaning_averages()
+            # We apply averages cleaning to the data
+            self.data = self.apply_averages_cleaning(
+                data_to_clean=self.data,
+                cleaning_data=self.cleaning_averages,
+                cols_to_merge_on=COLUMNS_TO_MERGE_ON
+            )
+
+            self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
+
 
     def na_remapping(self):
 
         fill_na_map_apply = {
             k: v for k, v in fill_na_map.items() if k in self.data.columns
-        } if self.newdata else fill_na_map
+        } if self.is_newdata else fill_na_map
 
         for column, fill_value in fill_na_map_apply.items():
             self.data[column] = self.data[column].fillna(fill_value)
@@ -264,7 +279,7 @@ class DataProcessor:
         data = data.replace(np.NAN, None)
 
         # Remap certain columns
-        if not self.newdata:
+        if not self.is_newdata:
             data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
         data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
 
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index 18228cb2..a7f9db12 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -397,6 +397,22 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column
     if all_equal:
         return True
 
+class EPCRecord:
+    """
+    Base class for an EPC record
+    """
+
+    def __init__(self, num) -> None:
+        self.num = num
+
+    def __sub__(self, other: "EPCRecord"):
+        return self.num - other.num
+    
+
+test = EPCRecord(10)
+test2 = EPCRecord(20)
+test - test2
+
 
 def app():
     # Get all the files in the directory
@@ -419,18 +435,12 @@ def app():
 
         data_processor = DataProcessor(filepath=filepath)
 
-        df = data_processor.pre_process()
+        data_processor.pre_process()
+        
+        df = data_processor.data
+        cleaning_averages = data_processor.cleaning_averages
 
-        cleaning_averages = data_processor.make_cleaning_averages()
-
-        # We have some odd cases with missing constituency so we fill
-        df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
-
-        df = DataProcessor.apply_averages_cleaning(
-            data_to_clean=df,
-            cleaning_data=cleaning_averages,
-            cols_to_merge_on=COLUMNS_TO_MERGE_ON
-        )
+        cleaning_dataset.append(cleaning_averages)
 
         data_by_urpn = []
         for uprn, property_data in df.groupby("UPRN", observed=True):
@@ -592,9 +602,6 @@ def app():
 
         dataset.append(data_by_urpn_df)
 
-        cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
-        cleaning_dataset.append(cleaning_averages)
-
     print("Final all equal count: %s" % str(len(all_equal_rows)))
 
     # Store cleaning dataset in s3 as a parquet file