diff --git a/model_data/simulation_system/core/DataProcessor.py b/model_data/simulation_system/core/DataProcessor.py index ba3cee33..878778de 100644 --- a/model_data/simulation_system/core/DataProcessor.py +++ b/model_data/simulation_system/core/DataProcessor.py @@ -414,6 +414,7 @@ class DataProcessor: # We observed 7 final records with missing windows and 2 records with missing hot water so we shall remove them self.data = self.data[~pd.isnull(self.data["WINDOWS_DESCRIPTION"])] self.data = self.data[~pd.isnull(self.data["HOTWATER_DESCRIPTION"])] + self.data = self.data[~pd.isnull(self.data["ROOF_DESCRIPTION"])] def clean_multi_glaze_proportion(self) -> None: """ diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index 0135c14a..935ae940 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -211,4 +211,5 @@ fill_na_map = { "LOW_ENERGY_LIGHTING": 0, "MAINHEATCONT_DESCRIPTION": "Unknown", "EXTENSION_COUNT": 0, + "NUMBER_OPEN_FIREPLACES": 0 } diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index 5a2f60c3..ddb5658d 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -81,11 +81,11 @@ def process_and_prune_desriptions(df, cleaned_lookup): 'no_data_ENDING', ], "roof": [ - 'original_description', 'clean_description', 'thermal_transmittance', + 'original_description', 'thermal_transmittance', 'thermal_transmittance_unit', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', 'is_assumed', 'has_dwelling_above', 'is_valid', 'insulation_thickness', - 'original_description_ENDING', 'clean_description_ENDING', + 'original_description_ENDING', 'thermal_transmittance_ENDING', 'thermal_transmittance_unit_ENDING', 'is_pitched_ENDING', 'is_roof_room_ENDING', 'is_loft_ENDING', 'is_flat_ENDING', 'is_thatched_ENDING', 'is_at_rafters_ENDING', @@ -180,13 +180,25 @@ def app(): df = data_processor.pre_process() cleaning_averages = data_processor.make_cleaning_averages() + # We have some odd cases with missing constituency so we fill + df = df.fillna({"CONSTITUENCY": df["CONSTITUENCY"].mode().values[0]}) + + df = DataProcessor.apply_averages_cleaning( + data_to_clean=df, + cleaning_data=cleaning_averages, + cols_to_merge_on=COLUMNS_TO_MERGE_ON + ) + data_by_urpn = [] for uprn, property_data in df.groupby("UPRN", observed=True): + # Fixed features - these are property attributes that shouldn't change over time fixed_data = {} # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row - if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1): + if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1) or ( + pd.isnull(property_data[MANDATORY_FIXED_FEATURES]).sum().sum() > 0 + ): continue # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS @@ -195,36 +207,22 @@ def app(): property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() ) - # Extract the columns that are not all None - modified_property_data = DataProcessor.apply_averages_cleaning( - data_to_clean=property_data, - cleaning_data=cleaning_averages, - cols_to_merge_on=COLUMNS_TO_MERGE_ON - ) - # Combine all fields together fixed_data.update(mandatory_field_data) fixed_data.update(latest_field_data) - # Apply cleaning to fixed_data - fixed_data = DataProcessor.apply_averages_cleaning( - data_to_clean=pd.DataFrame([fixed_data]), - cleaning_data=cleaning_averages, - cols_to_merge_on=COLUMNS_TO_MERGE_ON - ).to_dict("records")[0] - # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time - variable_data = modified_property_data[ + variable_data = property_data[ COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE] ] # Note: we look at changes between subsequent EPCS, however we could look at other permutations # e.g. first vs second, second vs third and also first vs third property_model_data = [] - for idx in range(0, modified_property_data.shape[0] - 1): + for idx in range(0, property_data.shape[0] - 1): - if idx >= modified_property_data.shape[0] - 1: + if idx >= property_data.shape[0] - 1: break earliest_record = variable_data.iloc[idx]