mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-30 13:10:47 +00:00
Merge branch 'etl-michael' of github.com:Hestia-Homes/Model into etl-michael
This commit is contained in:
commit
2845badbc0
2 changed files with 89 additions and 27 deletions
|
|
@ -67,16 +67,17 @@ class DataProcessor:
|
||||||
Handle data loading and data preprocessing
|
Handle data loading and data preprocessing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, filepath: Path | None, newdata: bool = False) -> None:
|
def __init__(self, filepath: Path | None, is_newdata: bool = False) -> None:
|
||||||
"""
|
"""
|
||||||
:param filepath: If specified, is the physical location of the data
|
:param filepath: If specified, is the physical location of the data
|
||||||
:param newdata: Indicates if we are processing new, testing data.
|
:param is_newdata: Indicates if we are processing new, testing data.
|
||||||
In this instance, there are some operations we do not
|
In this instance, there are some operations we do not
|
||||||
want to perform, such as confine_data()
|
want to perform, such as confine_data()
|
||||||
"""
|
"""
|
||||||
self.filepath = filepath
|
self.filepath = filepath
|
||||||
self.data = None
|
self.data = None
|
||||||
self.newdata = newdata
|
self.cleaning_averages = None
|
||||||
|
self.is_newdata = is_newdata
|
||||||
|
|
||||||
def load_data(self, low_memory=False) -> None:
|
def load_data(self, low_memory=False) -> None:
|
||||||
if not self.filepath:
|
if not self.filepath:
|
||||||
|
|
@ -130,6 +131,7 @@ class DataProcessor:
|
||||||
TODO: We could use a model based impution approach for possibly more accurate cleaning
|
TODO: We could use a model based impution approach for possibly more accurate cleaning
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# TODO: DO we want to move this out of this function? (i.e. alter the data before we do any cleaning)
|
||||||
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
|
self.data["POSTAL_AREA"] = self.data["POSTCODE"].apply(lambda x: x.split(" ")[0])
|
||||||
|
|
||||||
def apply_clean(data, matching_columns):
|
def apply_clean(data, matching_columns):
|
||||||
|
|
@ -174,13 +176,13 @@ class DataProcessor:
|
||||||
if self.data is None:
|
if self.data is None:
|
||||||
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
self.load_data(low_memory=DATA_PROCESSOR_SETTINGS["low_memory"])
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
self.confine_data()
|
self.confine_data()
|
||||||
|
|
||||||
self.remap_columns()
|
self.remap_columns()
|
||||||
|
|
||||||
# We have some non-standard construction age bands which we'll clean for matching
|
# We have some non-standard construction age bands which we'll clean for matching
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
self.standardise_construction_age_band()
|
self.standardise_construction_age_band()
|
||||||
self.clean_missing_rooms()
|
self.clean_missing_rooms()
|
||||||
|
|
||||||
|
|
@ -188,12 +190,12 @@ class DataProcessor:
|
||||||
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
column_mappings=DATA_PROCESSOR_SETTINGS["column_mappings"]
|
||||||
)
|
)
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
self.clean_multi_glaze_proportion()
|
self.clean_multi_glaze_proportion()
|
||||||
|
|
||||||
self.clean_photo_supply()
|
self.clean_photo_supply()
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
self.retain_multiple_epc_properties(
|
self.retain_multiple_epc_properties(
|
||||||
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
|
epc_minimum_count=DATA_PROCESSOR_SETTINGS["epc_minimum_count"]
|
||||||
)
|
)
|
||||||
|
|
@ -202,24 +204,37 @@ class DataProcessor:
|
||||||
# If we have multiple EPC records, we can try and do filling
|
# If we have multiple EPC records, we can try and do filling
|
||||||
self.fill_na_fields()
|
self.fill_na_fields()
|
||||||
|
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
self.data = self.data.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||||
|
|
||||||
# Final re-casting after data transformed and prepared
|
# Final re-casting after data transformed and prepared
|
||||||
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.newdata else COLUMNTYPES
|
coltypes = {k: v for k, v in COLUMNTYPES.items() if k in self.data.columns} if self.is_newdata else COLUMNTYPES
|
||||||
for k, v in coltypes.items():
|
for k, v in coltypes.items():
|
||||||
self.data[k] = self.data[k].astype(v)
|
self.data[k] = self.data[k].astype(v)
|
||||||
self.data = self.data.astype(coltypes)
|
self.data = self.data.astype(coltypes)
|
||||||
|
|
||||||
self.na_remapping()
|
self.na_remapping()
|
||||||
|
|
||||||
return self.data
|
if not self.is_newdata:
|
||||||
|
# We have some odd cases with missing constituency so we fill
|
||||||
|
self.data = self.data.fillna({"CONSTITUENCY": self.data["CONSTITUENCY"].mode().values[0]})
|
||||||
|
|
||||||
|
self.cleaning_averages = self.make_cleaning_averages()
|
||||||
|
# We apply averages cleaning to the data
|
||||||
|
self.data = self.apply_averages_cleaning(
|
||||||
|
data_to_clean=self.data,
|
||||||
|
cleaning_data=self.cleaning_averages,
|
||||||
|
cols_to_merge_on=COLUMNS_TO_MERGE_ON
|
||||||
|
)
|
||||||
|
|
||||||
|
self.cleaning_averages["LOCAL_AUTHORITY"] = self.data["LOCAL_AUTHORITY"].values[0]
|
||||||
|
|
||||||
|
|
||||||
def na_remapping(self):
|
def na_remapping(self):
|
||||||
|
|
||||||
fill_na_map_apply = {
|
fill_na_map_apply = {
|
||||||
k: v for k, v in fill_na_map.items() if k in self.data.columns
|
k: v for k, v in fill_na_map.items() if k in self.data.columns
|
||||||
} if self.newdata else fill_na_map
|
} if self.is_newdata else fill_na_map
|
||||||
|
|
||||||
for column, fill_value in fill_na_map_apply.items():
|
for column, fill_value in fill_na_map_apply.items():
|
||||||
self.data[column] = self.data[column].fillna(fill_value)
|
self.data[column] = self.data[column].fillna(fill_value)
|
||||||
|
|
@ -264,7 +279,7 @@ class DataProcessor:
|
||||||
data = data.replace(np.NAN, None)
|
data = data.replace(np.NAN, None)
|
||||||
|
|
||||||
# Remap certain columns
|
# Remap certain columns
|
||||||
if not self.newdata:
|
if not self.is_newdata:
|
||||||
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
data["FLOOR_LEVEL"] = data["FLOOR_LEVEL"].replace(FLOOR_LEVEL_MAP)
|
||||||
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
data["BUILT_FORM"] = data["BUILT_FORM"].replace(BUILT_FORM_REMAP)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -397,6 +397,63 @@ def compare_records(earliest_record: pd.Series, latest_record: pd.Series, column
|
||||||
if all_equal:
|
if all_equal:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
@dataclass
|
||||||
|
class EPCRecord:
|
||||||
|
"""
|
||||||
|
Base class for a EPC record
|
||||||
|
"""
|
||||||
|
WALLS_DESCRIPTION: str
|
||||||
|
FLOOR_DESCRIPTION: str
|
||||||
|
LIGHTING_DESCRIPTION: str
|
||||||
|
ROOF_DESCRIPTION: str
|
||||||
|
MAINHEAT_DESCRIPTION: str
|
||||||
|
HOTWATER_DESCRIPTION: str
|
||||||
|
MAIN_FUEL: str
|
||||||
|
MECHANICAL_VENTILATION: str
|
||||||
|
SECONDHEAT_DESCRIPTION: str
|
||||||
|
WINDOWS_DESCRIPTION: str
|
||||||
|
GLAZED_TYPE: str
|
||||||
|
MULTI_GLAZE_PROPORTION: float
|
||||||
|
LOW_ENERGY_LIGHTING: float
|
||||||
|
NUMBER_OPEN_FIREPLACES: float
|
||||||
|
MAINHEATCONT_DESCRIPTION: str
|
||||||
|
SOLAR_WATER_HEATING_FLAG: str
|
||||||
|
PHOTO_SUPPLY: float
|
||||||
|
TRANSACTION_TYPE: str
|
||||||
|
ENERGY_TARIFF: str
|
||||||
|
EXTENSION_COUNT: float
|
||||||
|
TOTAL_FLOOR_AREA: float
|
||||||
|
FLOOR_HEIGHT: float
|
||||||
|
HOT_WATER_ENERGY_EFF: str
|
||||||
|
FLOOR_ENERGY_EFF: str
|
||||||
|
WINDOWS_ENERGY_EFF: str
|
||||||
|
WALLS_ENERGY_EFF: str
|
||||||
|
SHEATING_ENERGY_EFF: str
|
||||||
|
ROOF_ENERGY_EFF: str
|
||||||
|
MAINHEAT_ENERGY_EFF: str
|
||||||
|
MAINHEATC_ENERGY_EFF: str
|
||||||
|
LIGHTING_ENERGY_EFF: str
|
||||||
|
POTENTIAL_ENERGY_EFFICIENCY: float
|
||||||
|
ENVIRONMENT_IMPACT_POTENTIAL: float
|
||||||
|
ENERGY_CONSUMPTION_POTENTIAL: float
|
||||||
|
CO2_EMISSIONS_POTENTIAL: float
|
||||||
|
LODGEMENT_DATE: str
|
||||||
|
CURRENT_ENERGY_EFFICIENCY: int
|
||||||
|
ENERGY_CONSUMPTION_CURRENT: int
|
||||||
|
CO2_EMISSIONS_CURRENT: float
|
||||||
|
|
||||||
|
# def __init__(self, num) -> None:
|
||||||
|
# self.num = num
|
||||||
|
|
||||||
|
# def __sub__(self, other):
|
||||||
|
# return self.num - other.num
|
||||||
|
|
||||||
|
|
||||||
|
test = EPCRecord(10)
|
||||||
|
test2 = EPCRecord(20)
|
||||||
|
test - test2
|
||||||
|
|
||||||
|
|
||||||
def app():
|
def app():
|
||||||
# Get all the files in the directory
|
# Get all the files in the directory
|
||||||
|
|
@ -419,18 +476,11 @@ def app():
|
||||||
|
|
||||||
data_processor = DataProcessor(filepath=filepath)
|
data_processor = DataProcessor(filepath=filepath)
|
||||||
|
|
||||||
df = data_processor.pre_process()
|
data_processor.pre_process()
|
||||||
|
|
||||||
|
df = data_processor.data
|
||||||
|
|
||||||
cleaning_averages = data_processor.make_cleaning_averages()
|
cleaning_dataset.append(data_processor.cleaning_averages)
|
||||||
|
|
||||||
# We have some odd cases with missing constituency so we fill
|
|
||||||
df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
|
|
||||||
|
|
||||||
df = DataProcessor.apply_averages_cleaning(
|
|
||||||
data_to_clean=df,
|
|
||||||
cleaning_data=cleaning_averages,
|
|
||||||
cols_to_merge_on=COLUMNS_TO_MERGE_ON
|
|
||||||
)
|
|
||||||
|
|
||||||
data_by_urpn = []
|
data_by_urpn = []
|
||||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||||
|
|
@ -466,7 +516,7 @@ def app():
|
||||||
# e.g. first vs second, second vs third and also first vs third
|
# e.g. first vs second, second vs third and also first vs third
|
||||||
property_model_data = []
|
property_model_data = []
|
||||||
for idx in range(0, property_data.shape[0] - 1):
|
for idx in range(0, property_data.shape[0] - 1):
|
||||||
|
|
||||||
if idx >= property_data.shape[0] - 1:
|
if idx >= property_data.shape[0] - 1:
|
||||||
break
|
break
|
||||||
|
|
||||||
|
|
@ -592,9 +642,6 @@ def app():
|
||||||
|
|
||||||
dataset.append(data_by_urpn_df)
|
dataset.append(data_by_urpn_df)
|
||||||
|
|
||||||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
|
||||||
cleaning_dataset.append(cleaning_averages)
|
|
||||||
|
|
||||||
print("Final all equal count: %s" % str(len(all_equal_rows)))
|
print("Final all equal count: %s" % str(len(all_equal_rows)))
|
||||||
|
|
||||||
# Store cleaning dataset in s3 as a parquet file
|
# Store cleaning dataset in s3 as a parquet file
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue