Fixed the bug which was dropping the roof description

This commit is contained in:
Khalim Conn-Kowlessar 2023-09-17 22:20:23 +01:00
parent b71e76449f
commit 10fc349114
3 changed files with 20 additions and 20 deletions

View file

@ -414,6 +414,7 @@ class DataProcessor:
# We observed 7 final records with missing windows and 2 records with missing hot water, so we remove them, along with any records missing a roof description
self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])]
self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])]
def clean_multi_glaze_proportion(self) -> None:
"""

View file

@ -211,4 +211,5 @@ fill_na_map = {
"LOW_ENERGY_LIGHTING": 0,
"MAINHEATCONT_DESCRIPTION": "Unknown",
"EXTENSION_COUNT": 0,
"NUMBER_OPEN_FIREPLACES": 0
}

View file

@ -81,11 +81,11 @@ def process_and_prune_desriptions(df, cleaned_lookup):
'no_data_ENDING',
],
"roof": [
'original_description', 'clean_description', 'thermal_transmittance',
'original_description', 'thermal_transmittance',
'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft',
'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed',
'has_dwelling_above', 'is_valid', 'insulation_thickness',
'original_description_ENDING', 'clean_description_ENDING',
'original_description_ENDING',
'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING',
'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING',
'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING',
@ -180,13 +180,25 @@ def app():
df = data_processor.pre_process()
cleaning_averages = data_processor.make_cleaning_averages()
# We have some odd cases with a missing constituency, so we fill them with the most common (modal) constituency
df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]})
df = DataProcessor.apply_averages_cleaning(
data_to_clean=df,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
)
data_by_urpn = []
for uprn, property_data in df.groupby("UPRN", observed=True):
# Fixed features - these are property attributes that shouldn't change over time
fixed_data = {}
# If a property's mandatory fixed features (e.g. building type) differ between its records, or any of them are missing, we skip the property, i.e. there should be exactly 1 unique, complete set of values
if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1):
if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1) or (
pd.isnull(property_data[MANDATORY_FIXED_FEATURES]).sum().sum() > 0
):
continue
# Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS
@ -195,36 +207,22 @@ def app():
property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict()
)
# Extract the columns that are not all None
modified_property_data = DataProcessor.apply_averages_cleaning(
data_to_clean=property_data,
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
)
# Combine all fields together
fixed_data.update(mandatory_field_data)
fixed_data.update(latest_field_data)
# Apply cleaning to fixed_data
fixed_data = DataProcessor.apply_averages_cleaning(
data_to_clean=pd.DataFrame([fixed_data]),
cleaning_data=cleaning_averages,
cols_to_merge_on=COLUMNS_TO_MERGE_ON
).to_dict("records")[0]
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = modified_property_data[
variable_data = property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
]
# Note: we look at changes between consecutive EPCs; however, we could also look at other permutations,
# e.g. first vs second, second vs third and also first vs third
property_model_data = []
for idx in range(0, modified_property_data.shape[0] - 1):
for idx in range(0, property_data.shape[0] - 1):
if idx >= modified_property_data.shape[0] - 1:
if idx >= property_data.shape[0] - 1:
break
earliest_record = variable_data.iloc[idx]