diff --git a/BaseUtility.py b/BaseUtility.py index bd2f091e..e799144d 100644 --- a/BaseUtility.py +++ b/BaseUtility.py @@ -45,7 +45,9 @@ class Definitions: # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases. "NULL", # We sometimes see fields populated with just an empty string. - "" + "", + # An older value which rarely shows up but has been seen in the data. + "UNKNOWN", } DATA_ANOMALY_SUBSTRINGS = { diff --git a/backend/Property.py b/backend/Property.py index 4d26857d..82695b75 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map from etl.solar.SolarPhotoSupply import SolarPhotoSupply from utils.logger import setup_logger from utils.s3 import read_dataframe_from_s3_parquet -from BaseUtility import Definitions +from etl.epc.settings import DATA_ANOMALY_MATCHES from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP from recommendations.recommendation_utils import ( estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows @@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT = logger = setup_logger() -class Property(Definitions): +class Property: ATTRIBUTE_MAP = { "floor-description": "floor", "hotwater-description": "hotwater", @@ -51,6 +51,8 @@ class Property(Definitions): spatial = None base_difference_record = None + DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES + def __init__(self, id, postcode, address, epc_record): self.epc_record = epc_record @@ -302,6 +304,7 @@ class Property(Definitions): self.set_basic_property_dimensions() for description, attribute in cleaned.items(): + if self.data[description] in self.DATA_ANOMALY_MATCHES: template = cleaned[description][0] fill_dict = dict(zip(template.keys(), [None] * len(template))) @@ -319,7 +322,7 @@ class Property(Definitions): attributes 
@staticmethod
def identify_built_form_ha6(property_string):
    """
    Identify the built form of a property from a free-text property description.

    The description is lower-cased, hyphens are treated as spaces and runs of whitespace are collapsed before
    matching, so "Semi-Detached", "semi  detached" and "SEMI DETACHED" are all recognised as the same thing.

    :param property_string: The string describing the property. Non-string values (e.g. None or NaN coming from
        an empty spreadsheet cell) are accepted and yield None.
    :return: One of 'Semi-Detached', 'Detached', 'Mid-Terrace', 'End-Terrace', or None if no built form can be
        identified.
    """
    # Blank cells reach us as None / float NaN when this is used with Series.apply; there is nothing to match.
    if not isinstance(property_string, str):
        return None

    # Order matters: 'semi detached' must be tested before 'detached', because the latter is a substring of
    # the former.
    built_forms = (
        ('Semi-Detached', ('semi detached',)),
        ('Detached', ('detached',)),
        ('Mid-Terrace', ('mid terrace', 'mid town house')),
        ('End-Terrace', ('end terrace', 'end town house')),
    )

    # Normalise case, hyphens and whitespace so that hyphenated spellings are not misclassified
    # (previously "semi-detached" fell through to the 'detached' keyword).
    property_string_normalized = " ".join(property_string.lower().replace("-", " ").split())

    for built_form, keywords in built_forms:
        if any(keyword in property_string_normalized for keyword in keywords):
            return built_form

    # No keyword matched
    return None
+ # For each HA, we read pull in the data required, and store in S3 asset_list = data_assets["asset_list"].copy() @@ -490,8 +551,12 @@ def get_epc_data( # We iterate through the asset list and pull what we need results = [] scoring_data = [] + nodata = [] for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)): + if property_meta["matching_postcode"] is None: + continue + if ha_name == "ha_1": property_type = property_meta["Asset Type"] # We correct a small error @@ -503,6 +568,9 @@ def get_epc_data( property_type = "Flat" built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None) + elif ha_name == "ha_6": + property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]] + built_form = property_meta["built_form"] else: raise NotImplementedError("Implement me") @@ -517,6 +585,10 @@ def get_epc_data( searcher.ordnance_survey_client.built_form = built_form searcher.find_property(skip_os=True) + if searcher.newest_epc is None: + nodata.append(property_meta) + continue + if searcher.newest_epc.get("estimated"): # We insert the row ID as our proxy for UPRN searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1]) @@ -606,6 +678,7 @@ def get_epc_data( "cavity_age": cavity_age, **eligibility.walls, **eligibility.roof, + "is_estimated": searcher.newest_epc.get("estimated") is not None } ) @@ -619,6 +692,10 @@ def get_epc_data( model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at) + # scoring_df["is_community"].value_counts() + # scoring_df[scoring_df["is_community"] == "Unknown"] + # property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze() + all_predictions = model_api.predict_all( df=scoring_df, bucket="retrofit-data-dev", @@ -678,8 +755,33 @@ def get_epc_data( } ) + eligibility_assessment = pd.DataFrame(eligibility_assessment) -def analyse_ha_data(): + results_df = results_df.merge( + 
def analyse_ha_data(outputs, loader):
    """
    Summarise, for each housing association (HA), how many properties have been sold and how many remain,
    split by funding scheme.

    For every HA in ``outputs`` the HA's asset list (held by ``loader``) is joined to that HA's ``results_df``.
    Anything matched to a survey-list row via ``matched_lookup`` is treated as sold and dropped; the remaining
    properties whose status is "identified potential eco works (CWI)" are then counted per funding scheme.

    NOTE: the funding scheme is currently a random placeholder (see the TODO below). It is written onto the
    HA's ``asset_list`` dataframe in place.

    :param outputs: dict keyed by HA name; each value is a dict containing at least a ``results_df`` dataframe
        with a ``row_id`` column.
    :param loader: object whose ``data`` attribute maps HA name to a dict with the keys ``asset_list``,
        ``survey_list`` and ``matched_lookup`` (the latter two may be None).
    :return: dict keyed by HA name. Each value is a dict with the keys ``n_properties_in_asset_list``,
        ``properties_sold_eco4``, ``remaining_properties_eco4``, ``properties_sold_gbis`` and
        ``remaining_properties_gbis``. A scheme with no matching properties is reported as 0.
    """
    # Local import: only needed for the placeholder scheme allocation
    import random

    cwi_status = "identified potential eco works (CWI)"

    analysis_results = {}

    for ha_name, datasets in outputs.items():

        # The raw inputs must be looked up before anything below touches them
        inputs = loader.data[ha_name]
        asset_list = inputs["asset_list"]

        # TODO: This is placeholder because we don't have the schemes that the properties have been qualified for
        #   yet
        randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=asset_list.shape[0])
        asset_list["randomly_allocated_schemes"] = randomly_allocated_schemes
        asset_list["funding_scheme"] = None
        asset_list["funding_scheme"] = np.where(
            asset_list["row_meaning"] == cwi_status,
            asset_list["randomly_allocated_schemes"],
            asset_list["funding_scheme"]
        )
        # End placeholder

        results_df = datasets["results_df"].copy()

        analysis_data = asset_list[["asset_list_row_id", "row_meaning", "funding_scheme"]].rename(
            columns={"row_meaning": "asset_identification_status"}
        ).merge(
            results_df,
            how="left",
            right_on="row_id",
            left_on="asset_list_row_id"
        )

        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()

        # Properties already sold, by scheme. A plain dict lets us default a missing scheme to zero instead of
        # indexing into a possibly-empty array.
        survey_list = inputs["survey_list"]
        if survey_list is not None:
            sold_by_scheme = survey_list.groupby("funding_scheme")["survey_list_row_id"].nunique().to_dict()
        else:
            sold_by_scheme = {}
        properties_sold_eco4 = int(sold_by_scheme.get("ECO4", 0))
        properties_sold_gbis = int(sold_by_scheme.get("GBIS", 0))

        # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just
        # what is remaining
        matched_lookup = inputs["matched_lookup"]
        if matched_lookup is not None:
            analysis_data = analysis_data.merge(
                matched_lookup, how="left", on="asset_list_row_id"
            )
            # Drop any rows that have a survey_list_row_id
            analysis_data = analysis_data[analysis_data["survey_list_row_id"].isnull()]

        # We now calculate the number of remaining properties, by scheme
        # TODO: We might need to tweak a bit of the knowledge
        remaining_properties = analysis_data[
            analysis_data["asset_identification_status"] == cwi_status
        ]
        remaining_by_scheme = (
            remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().to_dict()
        )
        remaining_properties_eco4 = int(remaining_by_scheme.get("ECO4", 0))
        remaining_properties_gbis = int(remaining_by_scheme.get("GBIS", 0))

        # TODO: For the remaining properties, use the results of the eligibility process to classify the property
        # into one of multiple categories (not implemented yet):
        #
        # For properties that have been identified as ECO4
        # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
        #   - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the
        #   properties here and re-surveying is a common practice by Warmfront. Additionally, many of the social
        #   homes have very old EPCs which may score lower when re-done
        # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
        #   - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
        #   actually look like after retrofit and so the EPC currently being a C or above means little, because
        #   the updated EPC, showing an empty cavity, could bring the property within
        # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm
        #   insulation.
        #   - No SAP constraint, for the same reason as in category 2)
        # 4) Does not look like ECO4 candidate
        #
        # For properties that have been identified as GBIS
        # 1) Strict GBIS candidates
        # 2) Properties that actually look like strict GBIS candidates
        # 3) Subject to CIGA check - Filled cavity
        # 4) Does not look like a GBIS candidate

        analysis_results[ha_name] = {
            "n_properties_in_asset_list": n_properties_in_asset_list,
            # ECO4
            "properties_sold_eco4": properties_sold_eco4,
            "remaining_properties_eco4": remaining_properties_eco4,
            # GBIS
            "properties_sold_gbis": properties_sold_gbis,
            "remaining_properties_gbis": remaining_properties_gbis
        }

    return analysis_results
# Names of columns that are treated as boolean when missing values are filled: TrainingDataset fills NaNs in any
# column listed here with False, even when neither True nor False appears among that column's unique values.
# TODO: Can probably produce this in the property change app and store in S3
BOOLEAN_VARIABLES = [
    'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
    'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending',
    'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid',
    'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
    'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
    'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump',
    'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
    'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
    'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration',
    'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric',
    'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
    'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire',
    'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending',
    'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending',
    'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending',
    'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending',
    'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending',
    'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending',
    'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending',
    'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending',
    'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending',
    'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending',
    'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending',
    'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats',
    'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network',
    'is_community_ending', 'no_individual_heating_or_community_network_ending'
]
self.df[col].unique() - if True in unique_values or False in unique_values: + if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES): self.df[col] = self.df[col].fillna(False) if "none" in unique_values: self.df[col] = self.df[col].fillna("none") diff --git a/etl/epc/settings.py b/etl/epc/settings.py index 33bab190..87f27972 100644 --- a/etl/epc/settings.py +++ b/etl/epc/settings.py @@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = { "", # We sometimes find None values - particulatly when we produce an estimated EPC None, + # An older value which rarely shows up but has been seen in the data. + "UNKNOWN", } DATA_ANOMALY_SUBSTRINGS = {