def insert_data(self, data: pd.DataFrame) -> None:
    """
    Store an already-loaded dataframe on the processor as ``self.data``
    """
    self.data = data


def standardise_construction_age_band(self) -> None:
    """
    This function will tidy up some of the non-standard values that are populated in the construction age
    band, which is useful for cleaning

    The cleaned values are written to a new ``CONSTRUCTION_AGE_BAND_CLEANED`` column on ``self.data``; the
    original ``CONSTRUCTION_AGE_BAND`` column is left untouched.

    :raises NotImplementedError: if a value is not missing, not a known anomaly, not a known band label,
        not a known remap label and not an integer-like year between 0 and 3000
    """
    bounds_map = {
        "England and Wales: before 1900": {"l": 0, "u": 1899},
        "England and Wales: 1930-1949": {"l": 1930, "u": 1949},
        "England and Wales: 1900-1929": {"l": 1900, "u": 1929},
        "England and Wales: 1950-1966": {"l": 1950, "u": 1966},
        "England and Wales: 1967-1975": {"l": 1967, "u": 1975},
        "England and Wales: 1976-1982": {"l": 1976, "u": 1982},
        "England and Wales: 1983-1990": {"l": 1983, "u": 1990},
        "England and Wales: 1991-1995": {"l": 1991, "u": 1995},
        "England and Wales: 1996-2002": {"l": 1996, "u": 2002},
        "England and Wales: 2003-2006": {"l": 2003, "u": 2006},
        "England and Wales: 2007-2011": {"l": 2007, "u": 2011},
        "England and Wales: 2012 onwards": {"l": 2012, "u": 3000},
    }

    remap = {
        "England and Wales: 2007 onwards": "England and Wales: 2007-2011"
    }

    # Year -> band label lookup. The bands do not overlap, so each year is written exactly once
    year_to_band = {}
    for label, bounds in bounds_map.items():
        year_to_band.update(dict.fromkeys(range(bounds["l"], bounds["u"] + 1), label))

    def clean_construction_age_band(x):
        # Firstly, we check if it's a missing or error value. pd.isnull is used rather than a membership
        # test against np.nan because a NaN is only "in" a list when it is the very same object
        if pd.isnull(x) or x in Definitions.DATA_ANOMALY_MATCHES:
            return x

        # Next, we check if it's already one of the standard labels
        if x in bounds_map:
            return x

        # We check if it's a standard remap value
        remap_value = remap.get(x, None)
        if remap_value:
            return remap_value

        # We check if it's a number, i.e. a single construction year
        try:
            year = int(x)
        except (TypeError, ValueError, OverflowError):
            year = None

        # Years outside the known bounds are reported the same way as any other unhandled value
        if year in year_to_band:
            return year_to_band[year]

        raise NotImplementedError("Not handled the case for value %s" % x)

    self.data["CONSTRUCTION_AGE_BAND_CLEANED"] = self.data["CONSTRUCTION_AGE_BAND"].apply(
        clean_construction_age_band
    )


def clean_missing_rooms(self) -> None:
    """
    For the number of heated rooms and number of habitable rooms, we clean these values up front,
    based on property archetype and age

    Missing values are filled with the median of the rows sharing the same matching columns. We start with
    the most specific match (property type, built form, cleaned age band and postal area) and drop the last
    matching column each time values are still missing. A ``POSTAL_AREA`` column is added to ``self.data``
    and ``self.data`` is replaced by the merged dataframe.

    TODO: We could use a model based impution approach for possibly more accurate cleaning

    :raises NotImplementedError: if values are still missing after matching on property type alone
    """

    # The postal area is the part of the postcode before the first space. The .str accessor leaves
    # missing postcodes as missing rather than failing on them
    self.data["POSTAL_AREA"] = self.data["POSTCODE"].str.split(" ").str[0]

    def fill_from_group_median(data, column, group_columns):
        # Median of the populated rows for each combination of the grouping columns
        cleaning_data = data[~pd.isnull(data[column])].groupby(
            group_columns
        )[column].median().reset_index()

        data = data.merge(
            cleaning_data, how="left", on=group_columns, suffixes=("", "_CLEANING")
        )

        # Only rows that were missing take the group median, populated rows keep their value
        data[column] = np.where(pd.isnull(data[column]), data[f"{column}_CLEANING"], data[column])
        return data.drop(columns=f"{column}_CLEANING")

    matching_columns = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND_CLEANED", "POSTAL_AREA"]

    for col in ["NUMBER_HEATED_ROOMS", "NUMBER_HABITABLE_ROOMS"]:

        # Most specific match first, then progressively fewer matching columns
        for number_of_columns in range(len(matching_columns), 0, -1):
            if not pd.isnull(self.data[col]).any():
                break
            self.data = fill_from_group_median(
                data=self.data,
                column=col,
                group_columns=matching_columns[0:number_of_columns]
            )

        # Check if we've matched on the first column alone and still have missings - something has gone
        # wrong or we have a very unique property type
        if pd.isnull(self.data[col]).any():
            raise NotImplementedError("Handle this edge case, we still have missings for column %s" % col)
def clean_photo_supply(self) -> None:
    """
    Replace missing photo supply entries with zero, populated entries are kept as they are
    """
    photo_supply_column = "PHOTO_SUPPLY"

    zero_filled = self.data[photo_supply_column].fillna(value=0)
    self.data[photo_supply_column] = zero_filled
If we see large +# swings in measured values, we take the most recent value for this field as we interpret this +# as inaccurate measurements in the past and use the most recent value +MULTIPLE_VALUES_MARGIN_FOR_ERROR = 0.1 + COLUMNS_TO_MERGE_ON = [ "PROPERTY_TYPE", "BUILT_FORM", diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index 502f7a06..80991e82 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -11,6 +11,7 @@ from simulation_system.core.Settings import ( RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, COLUMNS_TO_MERGE_ON, + MULTIPLE_VALUES_MARGIN_FOR_ERROR, ) from simulation_system.core.DataProcessor import DataProcessor from utils import save_dataframe_to_s3_parquet @@ -32,10 +33,60 @@ def app(): dataset = [] cleaning_dataset = [] - # 116 - # 128048706 - # PosixPath('/home/ubuntu/Documents/python/hestia/Model/model_data/simulation_system/data/all-domestic - # -certificates/domestic-E09000021-Kingston-upon-Thames') + + # TODO: Does energy tariff make a difference + # TODO: If SAP hasn't changed, we don't include the record + # TODO: Floor area will impact the EPC so instead of averaging, we should have a starting and ending value. + # TODO: Same as floor area for floor height + # TODO: If fundamental building fabric changes, we should proabably discard the record + # TODO: Should we prune records that have an exceptionally large amount of time between them? + # TODO: If we have multiple EPCs lodged on the same day, should we remove them? Could be corrections? + # + # TODO: REMOVE ME + dodgy_uprns = [] + observed_uprns = [ + "10002082244", # Doesn't really make sense, house no longer has lel and not has more insulation but lower score + "10002082259", + # Property has more roof insulation, lel, but now the floor isn't insulated and has a lower score. 
Also the + # floor assessment is now assumed whereas before it wasnt + "10002082418", # Walls went from insulated to not... + "10002082640", # Property identical besides different energy taffiff + "10002082830", # Lots of records going from not insulated to insulated but some parts of + # the property has gotten better + "10002083244", # latest epc indicates the property is worse + "10002083592", # lastest epc doesn't have a fuel system present, but has slightly more insulation. Also the + # floor type has changed from solid to syspended. lel has decreased + "100030533576", # property slightly worse, has less lels and the floor description has changed type + "100030533668", # has slightly less lels. Glazed type is now missing + "100030533803", # Not super clea why this is lower, newer epc has more lel but is using second heating + "100030534016", # Property has less lel but more roof insulation. Floor type has changed + "100030534040", # property has less lel and the floor type has changed + "100030534041", # property has less insulation and less lel + "100030534243", # Cavity wall has gone from filled to unfilled + "100030534294", # less roof insulation but now has an air source heat pump + "100030534322", # identical between records but now with higher lel but no change recorded + "100030534413", # identical between records but different energy tariff, no sap change + "100030534437", # property has less lel and the mainheating no longer has a programmer and trvs + "100030534569", # Cavity wall no longer filled, 30mm more roof insulation in newest epc + "100030534676", # Property has less lel, is now using secondary heating, has 50mm less roof insulation, but + # the wall cavity is no longer filled + "100030534732", # property has higher lel %. Not clear why this is worse, glazing type has changed. 
+ # This looks dodgy has the UPRN_SOURCE is address matched also the floor area has increased from the first to + # the later epc + "100030534791", # Property has started using secondary heating - the EPCs are taken on the same day so maybe we + # should discard + "100030534795", # More lel but a lot less insulation. This is a very dodgy record, sap has gone from 90 to 66 + # The newer epc indicates the property now has 40% photo supply so this doesn't make much sense + "100030534897", # Roof has gone from thatched with additional insulation to pitched with insulation, + # sap score hasn't changed + "100030534986", # Property has gone from 300mm loft insulation to none. has 2% higher lel (negligible) and + # slightly better main heating setup + "100030535043", # Property lel increased by 12%, not clear why sap worse. Maybe due to different floor area and + # wall height + "100030535173", # lel increased from 20% to 80% but roof gone from 100m insulation to "limited" insulation + "100030535244", # lel gone from 100% to 0%, sap is the same + ] + for directory in tqdm(directories): filepath = directory / "certificates.csv" @@ -74,9 +125,9 @@ def app(): vals = list(modified_property_data[field].dropna().unique()) if len(vals) > 1: - # Check the values are too far apart - # TODO: we could have multiple values here, why only use the first two? 
- if abs(vals[0] - vals[1]) / vals[0] > 0.1: + lowest_value = min(vals) + largest_value = max(vals) + if abs(largest_value - lowest_value) / lowest_value > MULTIPLE_VALUES_MARGIN_FOR_ERROR: # Take the more recent value since it's likely to be more accurate vals = [vals[-1]] @@ -111,6 +162,26 @@ def app(): - starting_record[HEAT_DEMAND_RESPONSE] ) + # Check for a change in the starting and ending record + check_cols = [ + col for col in starting_record.index if col not in [ + "LODGEMENT_DATE", "CURRENT_ENERGY_EFFICIENCY", "ENERGY_CONSUMPTION_CURRENT", "ENERGY_TARIFF" + ] + ] + all_same = True + for col in check_cols: + if starting_record[col] != ending_record[col]: + all_same = False + break + + if rdsap_change <= 0: + if all_same | (uprn in observed_uprns): + if uprn not in observed_uprns: + dodgy_uprns.append(uprn) + else: + compare = pd.concat([starting_record, ending_record], axis=1) + bljd + # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and # floors, we may want to use the U-value. We may also want to handle the (assumed) tags # within descriptions