testing a change

This commit is contained in:
Michael Duong 2023-12-02 11:24:01 +00:00
parent c86be4a9b6
commit 4102d23063
2 changed files with 48 additions and 26 deletions

View file

@ -67,16 +67,17 @@ class DataProcessor:
Handle data loading and data preprocessing
"""
def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
def __init__(self, filepath: Path | None, is_newdata: bool = False) -> None:
"""
:param filepath: If specified, is the physical location of the data
:param newdata: Indicates if we are processing new, testing data.
:param is_newdata: Indicates if we are processing new, testing data.
In this instance, there are some operations we do not
want to perform, such as confine_data()
"""
self.filepath = filepath
self.data = None
self.newdata = newdata
self.cleaning_averages = None
self.is_newdata = is_newdata
def load_data(self, low_memory=False) -> None:
if not self.filepath:
@ -130,6 +131,7 @@ class DataProcessor:
TODO: We could use a model based impution approach for possibly more accurate cleaning
"""
# TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
def apply_clean(data, matching_columns):
@ -174,13 +176,13 @@ class DataProcessor:
if self.data is None:
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
if not self.newdata:
if not self.is_newdata:
self.confine_data()
self.remap_columns()
# We have some non-standard construction age bands which we'll clean for matching
if not self.newdata:
if not self.is_newdata:
self.standardise_construction_age_band()
self.clean_missing_rooms()
@ -188,12 +190,12 @@ class DataProcessor:
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
)
if not self.newdata:
if not self.is_newdata:
self.clean_multi_glaze_proportion()
self.clean_photo_supply()
if not self.newdata:
if not self.is_newdata:
self.retain_multiple_epc_properties(
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
)
@ -202,24 +204,37 @@ class DataProcessor:
# If we have multiple EPC records, we can try and do filling
self.fill_na_fields()
if not self.newdata:
if not self.is_newdata:
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
# Final re-casting after data transformed and prepared
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES
for k, v in coltypes.items():
self.data[k] = self.data[k].astype(v)
self.data = self.data.astype(coltypes)
self.na_remapping()
return self.data
if not self.is_newdata:
# We have some odd cases with missing constituency so we fill
self.data = self.data.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
self.cleaning_averages = self.make_cleaning_averages()
# We apply averages cleaning to the data
self.data = self.apply_averages_cleaning(
data_to_clean=self.data,
cleaning_data=self.cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
)
self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
def na_remapping(self):
fill_na_map_apply = {
k: v for k, v in fill_na_map.items() if k in self.data.columns
} if self.newdata else fill_na_map
} if self.is_newdata else fill_na_map
for column, fill_value in fill_na_map_apply.items():
self.data[column] = self.data[column].fillna(fill_value)
@ -264,7 +279,7 @@ class DataProcessor:
data = data.replace(np.NAN, None)
# Remap certain columns
if not self.newdata:
if not self.is_newdata:
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)

View file

@ -397,6 +397,22 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column
if all_equal:
return True
class EPCRecord:
"""
Base class for a EPC record
"""
def __init__(self, num) -> None:
self.num = num
def __sub__(self, other: EPCRecord):
return self.num - other.num
test = EPCRecord(10)
test2 = EPCRecord(20)
test - test2
def app():
# Get all the files in the directory
@ -419,18 +435,12 @@ def app():
data_processor = DataProcessor(filepath=filepath)
df = data_processor.pre_process()
data_processor.pre_process()
df = data_processor.data
cleaning_averages = data_processor.cleaning_averages
cleaning_averages = data_processor.make_cleaning_averages()
# We have some odd cases with missing constituency so we fill
df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
df = DataProcessor.apply_averages_cleaning(
data_to_clean=df,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
)
cleaning_dataset.append(cleaning_averages)
data_by_urpn = []
for uprn, property_data in df.groupby("UPRN", observed=True):
@ -592,9 +602,6 @@ def app():
dataset.append(data_by_urpn_df)
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
cleaning_dataset.append(cleaning_averages)
print("Final all equal count: %s" % str(len(all_equal_rows)))
# Store cleaning dataset in s3 as a parquet file