From 4102d23063072e2f5eeba2bb58e680f48c765695 Mon Sep 17 00:00:00 2001 From: Michael Duong Date: Sat, 2 Dec 2023 11:24:01 +0000 Subject: [PATCH] testing a change --- etl/epc/DataProcessor.py | 39 +++++++++++++++++++++++----------- etl/epc/property_change_app.py | 35 ++++++++++++++++++------------ 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py index 8adac8df..8dbdbeb9 100644 --- a/etl/epc/DataProcessor.py +++ b/etl/epc/DataProcessor.py @@ -67,16 +67,17 @@ class DataProcessor: Handle data loading and data preprocessing """ - def __init__(self, filepath: Path | None, newdata: bool = False) -> None: + def __init__(self, filepath: Path | None, is_newdata: bool = False) -> None: """ :param filepath: If specified, is the physical location of the data - :param newdata: Indicates if we are processing new, testing data. + :param is_newdata: Indicates if we are processing new, testing data. In this instance, there are some operations we do not want to perform, such as confine_data() """ self.filepath = filepath self.data = None - self.newdata = newdata + self.cleaning_averages = None + self.is_newdata = is_newdata def load_data(self, low_memory=False) -> None: if not self.filepath: @@ -130,6 +131,7 @@ class DataProcessor: TODO: We could use a model based impution approach for possibly more accurate cleaning """ + # TODO: DO we want to move this out of this function? (i.e. 
alter the data before we do any cleaning) self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0]) def apply_clean(data, matching_columns): @@ -174,13 +176,13 @@ class DataProcessor: if self.data is None: self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"]) - if not self.newdata: + if not self.is_newdata: self.confine_data() self.remap_columns() # We have some non-standard construction age bands which we'll clean for matching - if not self.newdata: + if not self.is_newdata: self.standardise_construction_age_band() self.clean_missing_rooms() @@ -188,12 +190,12 @@ class DataProcessor: column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"] ) - if not self.newdata: + if not self.is_newdata: self.clean_multi_glaze_proportion() self.clean_photo_supply() - if not self.newdata: + if not self.is_newdata: self.retain_multiple_epc_properties( epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"] ) @@ -202,24 +204,37 @@ class DataProcessor: # If we have multiple EPC records, we can try and do filling self.fill_na_fields() - if not self.newdata: + if not self.is_newdata: self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True) # Final re-casting after data transformed and prepared - coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES + coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES for k, v in coltypes.items(): self.data[k] = self.data[k].astype(v) self.data = self.data.astype(coltypes) self.na_remapping() - return self.data + if not self.is_newdata: + # We have some odd cases with missing constituency, so we fill them with the most common value + self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]}) + + self.cleaning_averages = self.make_cleaning_averages() + # We apply averages cleaning to the data + self.data = self.apply_averages_cleaning( + data_to_clean=self.data, +
cleaning_data=self.cleaning_averages, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ) + + self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0] + def na_remapping(self): fill_na_map_apply = { k: v for k, v in fill_na_map.items() if k in self.data.columns - } if self.newdata else fill_na_map + } if self.is_newdata else fill_na_map for column, fill_value in fill_na_map_apply.items(): self.data[column] = self.data[column].fillna(fill_value) @@ -264,7 +279,7 @@ class DataProcessor: data = data.replace(np.NAN, None) # Remap certain columns - if not self.newdata: + if not self.is_newdata: data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP) data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP) diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py index 18228cb2..a7f9db12 100644 --- a/etl/epc/property_change_app.py +++ b/etl/epc/property_change_app.py @@ -397,6 +397,22 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column if all_equal: return True +class EPCRecord: + """ + Base class for an EPC record + """ + + def __init__(self, num) -> None: + self.num = num + + def __sub__(self, other: "EPCRecord"): + return self.num - other.num + + +test = EPCRecord(10) +test2 = EPCRecord(20) +test - test2 + def app(): # Get all the files in the directory @@ -419,18 +435,12 @@ def app(): data_processor = DataProcessor(filepath=filepath) - df = data_processor.pre_process() + data_processor.pre_process() + + df = data_processor.data + cleaning_averages = data_processor.cleaning_averages - cleaning_averages = data_processor.make_cleaning_averages() - - # We have some odd cases with missing constituency so we fill - df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]}) - - df = DataProcessor.apply_averages_cleaning( - data_to_clean=df, - cleaning_data=cleaning_averages, - cols_to_merge_on=COLUMNS_TO_MERGE_ON - ) + cleaning_dataset.append(cleaning_averages) data_by_urpn = []
for uprn, property_data in df.groupby("UPRN", observed=True): @@ -592,9 +602,6 @@ def app(): dataset.append(data_by_urpn_df) - cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0] - cleaning_dataset.append(cleaning_averages) - print("Final all equal count: %s" % str(len(all_equal_rows))) # Store cleaning dataset in s3 as a parquet file