fix weird cases for now

This commit is contained in:
Michael Duong 2024-02-22 20:22:11 +00:00
parent 955e72f0bb
commit ed407bc98b
3 changed files with 145 additions and 92 deletions

View file

@ -809,6 +809,7 @@ class TrainingDataset(BaseDataset):
# else:
# return self.__add__(other)
class RecordDataset(BaseDataset):
"""
A collection of EPCRecords can be combined into a Dataset.
@ -824,25 +825,25 @@ class RecordDataset(BaseDataset):
self._expand_description_to_features(cleaned_lookup)
self._adjust_assumed_values_in_wall_descriptions()
self._generate_u_values_from_features()
# # TODO: For some of the features that we clean, we have either a true, false or possibly null value
# # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
# # need to
# # # TODO: For some of the features that we clean, we have either a true, false or possibly null value
# # # Those nulls should be False. clean_missings_after_description_process handles this but shouldn't
# # # need to
self._clean_missing_values()
self._null_validation(information="Clean Missing Values")
# self._remove_abnormal_change_in_floor_area()
# # self._remove_abnormal_change_in_floor_area()
self._ensure_numeric()
def _ensure_numeric(self):
"""
Ensure that all columns are numeric
"""
# TODO: move into EPCRecord record
uvalue_columns = [col for col in self.df.columns if "thermal_transmittance" in col]
uvalue_columns = [
col for col in self.df.columns if "thermal_transmittance" in col
]
for uvalue_col in uvalue_columns:
self.df[uvalue_col] = pd.to_numeric(self.df[uvalue_col])
def _clean_missing_values(self, ignore_cols=None):
missings = pd.isnull(self.df).sum()
missings = missings[missings > 0]
@ -859,17 +860,22 @@ class RecordDataset(BaseDataset):
else:
self.df[col] = self.df[col].fillna("Unknown")
@staticmethod
def _lambda_function_to_generate_roof_uvalue(row, is_end=False):
"""
Using the apply method, use the get_roof_u_value method to generate the u-value
"""
col_name = "roof_insulation_thickness" if not is_end else "roof_insulation_thickness_ending"
col_name = (
"roof_insulation_thickness"
if not is_end
else "roof_insulation_thickness_ending"
)
if row["has_dwelling_above"]:
if row["roof_thermal_transmittance"] != 0:
if (row["roof_thermal_transmittance"] != 0) & (
not pd.isnull(row["roof_thermal_transmittance"])
):
raise ValueError("Should have 0 u-value for roof")
return get_roof_u_value(
@ -881,16 +887,24 @@ class RecordDataset(BaseDataset):
is_flat=row["is_flat"],
is_pitched=row["is_pitched"],
is_at_rafters=row["is_at_rafters"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]]
)
age_band=england_wales_age_band_lookup[row["construction_age_band"]],
)
@staticmethod
def _lambda_function_to_generate_wall_uvalue(row, is_end=False):
"""
Using the apply method, use the get_wall_u_value method to generate the u-value
"""
description_col_name = "walls_clean_description" if not is_end else "walls_clean_description_ending"
thermal_transistance_col_name = "walls_thermal_transmittance" if not is_end else "walls_thermal_transmittance_ending"
description_col_name = (
"walls_clean_description"
if not is_end
else "walls_clean_description_ending"
)
thermal_transistance_col_name = (
"walls_thermal_transmittance"
if not is_end
else "walls_thermal_transmittance_ending"
)
if pd.isnull(row[thermal_transistance_col_name]):
output = get_wall_u_value(
@ -903,17 +917,23 @@ class RecordDataset(BaseDataset):
output = row[thermal_transistance_col_name]
return output
@staticmethod
def _lambda_function_to_generate_floor_uvalue(row, is_end=False):
"""
Using the apply method, use the get_floor_u_value method to generate the u-value
"""
floor_thermal_col_name = "floor_thermal_transmittance" if not is_end else "floor_thermal_transmittance_ending"
floor_thermal_col_name = (
"floor_thermal_transmittance"
if not is_end
else "floor_thermal_transmittance_ending"
)
if row["another_property_below"]:
if row["floor_thermal_transmittance"] != 0:
if (row["floor_thermal_transmittance"] != 0) & (
not pd.isnull(row["floor_thermal_transmittance"])
):
raise ValueError("Should have 0 u-value for floor")
return 0
@ -922,19 +942,27 @@ class RecordDataset(BaseDataset):
if pd.isnull(uvalue):
insulation_col_name = "floor_insulation_thickness" if not is_end else "floor_insulation_thickness_ending"
floor_area_col_name = "estimated_perimeter" if not is_end else "estimated_perimeter_ending"
perimeter_col_name = "total_floor_area" if not is_end else "total_floor_area_ending"
insulation_col_name = (
"floor_insulation_thickness"
if not is_end
else "floor_insulation_thickness_ending"
)
floor_area_col_name = (
"estimated_perimeter" if not is_end else "estimated_perimeter_ending"
)
perimeter_col_name = (
"total_floor_area" if not is_end else "total_floor_area_ending"
)
uvalue = get_floor_u_value(
floor_type=row["floor_type"],
perimeter=row[floor_area_col_name],
area=row[perimeter_col_name],
insulation_thickness=row[insulation_col_name],
wall_type=row["wall_type"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]]
)
floor_type=row["floor_type"],
perimeter=row[floor_area_col_name],
area=row[perimeter_col_name],
insulation_thickness=row[insulation_col_name],
wall_type=row["wall_type"],
age_band=england_wales_age_band_lookup[row["construction_age_band"]],
)
return uvalue
def _generate_u_values_from_features(self):
@ -947,58 +975,63 @@ class RecordDataset(BaseDataset):
# ~~~~~~~~~~~~~~~~~~
walls_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_wall_uvalue(row),
axis=1
lambda row: self._lambda_function_to_generate_wall_uvalue(row), axis=1
)
walls_uvalue = self.df['walls_thermal_transmittance'].fillna(walls_uvalue)
walls_uvalue = self.df["walls_thermal_transmittance"].fillna(walls_uvalue)
# ~~~~~~~~~~~~~~~~~~
# Roof
# ~~~~~~~~~~~~~~~~~~
roof_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_roof_uvalue(row),
axis=1
lambda row: self._lambda_function_to_generate_roof_uvalue(row), axis=1
)
roof_uvalue = self.df['roof_thermal_transmittance'].fillna(roof_uvalue)
roof_uvalue = self.df["roof_thermal_transmittance"].fillna(roof_uvalue)
# ~~~~~~~~~~~~~~~~~~
# Floor
# ~~~~~~~~~~~~~~~~~~
self.df['estimated_perimeter'] = self.df.apply(
lambda row: estimate_perimeter(row["total_floor_area"], row["number_habitable_rooms"]),
axis=1
self.df["estimated_perimeter"] = self.df.apply(
lambda row: estimate_perimeter(
row["total_floor_area"], row["number_habitable_rooms"]
),
axis=1,
)
self.df["floor_type"] = self.df["is_suspended"].replace({True: "suspended", False: "solid"})
self.df["floor_type"] = self.df["is_suspended"].replace(
{True: "suspended", False: "solid"}
)
self.df["wall_type"] = self.df.apply(
lambda row: get_wall_type(
is_cavity_wall=row["is_cavity_wall"],
is_solid_brick=row["is_solid_brick"],
is_timber_frame=row["is_timber_frame"],
is_granite_or_whinstone=row["is_granite_or_whinstone"],
is_cob=row["is_cob"],
is_cavity_wall=row["is_cavity_wall"],
is_solid_brick=row["is_solid_brick"],
is_timber_frame=row["is_timber_frame"],
is_granite_or_whinstone=row["is_granite_or_whinstone"],
is_cob=row["is_cob"],
is_sandstone_or_limestone=row["is_sandstone_or_limestone"],
is_system_built=row["is_system_built"],
is_park_home=row["is_park_home"]
),
axis=1
)
floor_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_floor_uvalue(row),
axis=1
is_park_home=row["is_park_home"],
),
axis=1,
)
floor_uvalue = self.df['floor_thermal_transmittance'].fillna(floor_uvalue)
floor_uvalue = self.df.apply(
lambda row: self._lambda_function_to_generate_floor_uvalue(row), axis=1
)
floor_uvalue = self.df["floor_thermal_transmittance"].fillna(floor_uvalue)
for component in ["walls", "roof", "floor"]:
self.df[f"{component}_thermal_transmittance"] = self.df[f"{component}_thermal_transmittance"].fillna(eval(f"{component}_uvalue"))
self.df[f"{component}_thermal_transmittance"] = self.df[
f"{component}_thermal_transmittance"
].fillna(eval(f"{component}_uvalue"))
self.df = self.df.drop(columns=["floor_type", "wall_type", "walls_clean_description"])
self.df = self.df.drop(
columns=["floor_type", "wall_type", "walls_clean_description"]
)
def _adjust_assumed_values_in_wall_descriptions(self):
"""
@ -1007,7 +1040,6 @@ class RecordDataset(BaseDataset):
for col in ["walls_clean_description"]:
self.df[col] = self.df[col].str.replace("(assumed)", "").str.rstrip()
def _clean_efficiency_variables(self):
"""
There is scope to clean this by the model per corresponding description.
@ -1023,7 +1055,7 @@ class RecordDataset(BaseDataset):
missings = missings[missings >= 1]
if len(missings) == 0:
return
return
# Make sure they are all efficiency columns
if any(~missings.index.str.contains("energy_eff")):
@ -1033,13 +1065,11 @@ class RecordDataset(BaseDataset):
column_index = self.df[m].isna()
self.df.loc[column_index, m] = "NO_RATING"
def _null_validation(self, information: str):
print(f"Null validation after {information}")
if pd.isnull(self.df).sum().sum():
raise ValueError(f"Null values found in dataset, after step {information}")
def _expand_description_to_features(self, cleaned_lookup: dict):
"""
This method will merge on the cleaned lookup table and ensure that the building fabric in the
@ -1050,49 +1080,63 @@ class RecordDataset(BaseDataset):
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
# is low
# We also replace descriptions with their cleaned variants
"""
"""
cols_to_drop = {
"walls": [
# We need the cleaned descriptions for pulling out u-values, so clean_description is not dropped here
'original_description', 'thermal_transmittance_unit',
"original_description",
"thermal_transmittance_unit",
# We remove the is_assumed columns
"is_assumed"
"is_assumed",
],
"floor": [
"original_description", "clean_description", "thermal_transmittance_unit",
"no_data",
"is_assumed"
"original_description",
"clean_description",
"thermal_transmittance_unit",
"no_data",
"is_assumed",
],
"roof": [
"original_description", "clean_description", "thermal_transmittance_unit",
"is_assumed", "is_valid"
"original_description",
"clean_description",
"thermal_transmittance_unit",
"is_assumed",
"is_valid",
],
"hotwater": [
"original_description", "clean_description", "assumed",
"original_description",
"clean_description",
"assumed",
],
"mainheat": [
"original_description", "clean_description",
"original_description",
"clean_description",
"has_assumed",
],
"mainheatcont": [
"original_description", "clean_description",
"original_description",
"clean_description",
],
"windows": [
"original_description", "clean_description",
"original_description",
"clean_description",
# We don't need many of the glazing coverage features because we have the multi_glaze_proportion feature
"has_glazing", "glazing_coverage", "no_data",
"has_glazing",
"glazing_coverage",
"no_data",
],
"main-fuel": [
"original_description", "clean_description",
"original_description",
"clean_description",
],
}
components_to_expand = cols_to_drop.keys()
for component in components_to_expand:
# TODO: change cleaned dataframe to have underscores instead of dashes
# TODO: change cleaned dataframe to have underscores instead of dashes
if component == "main-fuel":
cleaned_key = "main-fuel"
left_on_key = "main_fuel"
@ -1108,11 +1152,13 @@ class RecordDataset(BaseDataset):
cleaned_lookup_df_for_key,
how="left",
left_on=left_on_key,
right_on="original_description"
right_on="original_description",
)
# Drop original cols and cols to drop
expanded_df = expanded_df.drop(columns=cols_to_drop[component] + original_cols)
expanded_df = expanded_df.drop(
columns=cols_to_drop[component] + original_cols
)
# Rename columns to component specific names, if they have not been dropped
expanded_df = expanded_df.rename(
@ -1124,17 +1170,16 @@ class RecordDataset(BaseDataset):
}
)
self.df = expanded_df
# We don't need any lighting specific cleaning, we just drop the original description as we use
# LOW_ENERGY_LIGHTING_STARTING, LOW_ENERGY_LIGHTING_ENDING
self.df = self.df.drop(columns=["lighting_description"])
# def __add__(self, other) -> "NewDataset":
# if not isinstance(other, NewDataset):
# raise TypeError("Addition can only be performed with another instance of ScoringDataset")
# return NewDataset(self.datasets + other.datasets)
# def __radd__(self, other):
# """
# Required for sum() to work
@ -1142,4 +1187,4 @@ class RecordDataset(BaseDataset):
# if isinstance(other, int):
# return self
# else:
# return self.__add__(other)
# return self.__add__(other)

View file

@ -87,9 +87,9 @@ class EPCPipeline:
run_mode="training",
epc_local_file="certificates.csv",
epc_bucket_name="retrofit-data-dev",
epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_rooms.parquet",
epc_all_equal_rows_key="sap_change_model/all_equal_rows_rooms.parquet",
epc_compiled_dataset_key="sap_change_model/dataset_rooms.parquet",
epc_cleaning_dataset_key="sap_change_model/cleaning_dataset_record.parquet",
epc_all_equal_rows_key="sap_change_model/all_equal_rows_record.parquet",
epc_compiled_dataset_key="sap_change_model/dataset_record.parquet",
):
"""
:param directories: List of directories to process
@ -127,7 +127,6 @@ class EPCPipeline:
self.run_record_dataset_pipeline()
else:
raise ValueError("Run mode defined needs to be in 'training' or 'newdata'")
def run_record_dataset_pipeline(self):
"""
@ -150,9 +149,17 @@ class EPCPipeline:
)
# TODO: integrate with EPCRecord
record_dataset = constituency_data[['uprn'] + VARIABLE_DATA_FEATURES + MANDATORY_FIXED_FEATURES + LATEST_FIELD]
record_dataset = constituency_data[
["uprn"]
+ [RDSAP_RESPONSE]
+ VARIABLE_DATA_FEATURES
+ MANDATORY_FIXED_FEATURES
+ LATEST_FIELD
].rename(columns={RDSAP_RESPONSE: "sap"})
constituency_dataset = RecordDataset(datasets=record_dataset, cleaned_lookup=clean_lookup)
constituency_dataset = RecordDataset(
datasets=record_dataset, cleaned_lookup=clean_lookup
)
self.compiled_dataset = pd.concat(
[self.compiled_dataset, constituency_dataset.df]

View file

@ -12,10 +12,11 @@ def main():
"""
directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
# directories = directories[0:3]
# directories = directories[202:203]
epc_pipeline = EPCPipeline(
directories=directories,
run_mode="record",
epc_data_processor=EPCDataProcessor(run_mode="training"),
)