Added booleans to clean missings

2026-07-27 23:35:01 +00:00 · 2024-01-24 21:21:01 +00:00 · 2024-01-24 21:21:01 +00:00 · ef27d6b164
commit ef27d6b164
parent edb541f3dc
6 changed files with 288 additions and 12 deletions
--- a/BaseUtility.py
+++ b/BaseUtility.py
@ -45,7 +45,9 @@ class Definitions:
        # contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
        "NULL",
        # We sometimes see fields populated with just an empty string.
-        ""
+        "",
+        # An older value which rarely shows up but has been seen in the data.
+        "UNKNOWN",
    }

    DATA_ANOMALY_SUBSTRINGS = {
--- a/backend/Property.py
+++ b/backend/Property.py
@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
 from etl.solar.SolarPhotoSupply import SolarPhotoSupply
 from utils.logger import setup_logger
 from utils.s3 import read_dataframe_from_s3_parquet
-from BaseUtility import Definitions
+from etl.epc.settings import DATA_ANOMALY_MATCHES
 from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP
 from recommendations.recommendation_utils import (
    estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows
@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT =
 logger = setup_logger()


-class Property(Definitions):
+class Property:
    ATTRIBUTE_MAP = {
        "floor-description": "floor",
        "hotwater-description": "hotwater",
@ -51,6 +51,8 @@ class Property(Definitions):
    spatial = None
    base_difference_record = None

+    DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
+
    def __init__(self, id, postcode, address, epc_record):

        self.epc_record = epc_record
@ -302,6 +304,7 @@ class Property(Definitions):
        self.set_basic_property_dimensions()

        for description, attribute in cleaned.items():
+
            if self.data[description] in self.DATA_ANOMALY_MATCHES:
                template = cleaned[description][0]
                fill_dict = dict(zip(template.keys(), [None] * len(template)))
@ -319,7 +322,7 @@ class Property(Definitions):
            attributes = [
                x for x in cleaned[description] if x["original_description"] == self.data[description]
            ]
-            
+
            if len(attributes) > 1:
                raise ValueError("Either No attributes or multiple found for %s" % description)

--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@ -233,6 +233,13 @@ class Eligibility:
    def room_roof_insulation(self):
        is_room_roof = self.roof["is_roof_room"]

+        if not is_room_roof:
+            self.room_roof = {
+                "suitability": False,
+                "thickness": None
+            }
+            return
+
        insulation_thickness = convert_thickness_to_numeric(
            self.roof["insulation_thickness"],
            self.roof["is_pitched"],
@ -246,6 +253,14 @@ class Eligibility:

    def flat_roof_insulation(self):
        is_flat = self.roof["is_flat"]
+
+        if not is_flat:
+            self.flat_roof = {
+                "suitability": False,
+                "thickness": None
+            }
+            return
+
        insulation_thickness = convert_thickness_to_numeric(
            self.roof["insulation_thickness"],
            self.roof["is_pitched"],
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@ -154,6 +154,10 @@ class DataLoader:

            asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)

+        # Finally, we process property_type or built form, where needed
+        if ha_name == "ha_6":
+            asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6)
+
        return asset_list

    def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@ -412,6 +416,34 @@ class DataLoader:

        return matching_lookup

+    @staticmethod
+    def identify_built_form_ha6(property_string):
+        """
+        Identify the built form of a property from a free-text description.
+
+        Matching is case-insensitive and treats hyphens as spaces, so "Semi-Detached" is not misread as "Detached".
+
+        :param property_string: The string describing the property; non-strings (e.g. None or NaN) yield None
+        :return: The identified built form, or None if it cannot be identified
+        """
+        if not isinstance(property_string, str):
+            return None
+
+        # Order matters: 'semi detached' must be tested before its substring 'detached'
+        built_forms = {
+            'Semi-Detached': ['semi detached'],
+            'Detached': ['detached'],
+            'Mid-Terrace': ['mid terrace', 'mid town house'],
+            'End-Terrace': ['end terrace', 'end town house']
+        }
+        # Lower-case, swap hyphens for spaces and collapse whitespace runs before the substring search
+        property_string_normalized = " ".join(property_string.lower().replace("-", " ").split())
+
+        for built_form, keywords in built_forms.items():
+            if any(keyword in property_string_normalized for keyword in keywords):
+                return built_form
+        return None
+
    def load(self):

        if self.use_cache:
@ -461,7 +493,7 @@ class DataLoader:


 def get_epc_data(
-    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
+    loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
 ):
    if not loader.data:
        raise ValueError("Data not found - please run loader.load() first")
@ -476,10 +508,39 @@ def get_epc_data(
                'Enclosed Mid': 'Mid-Terrace',
                'Detached Local Connect': 'Detached',
            }
+        },
+        "ha_6": {
+            "property_type": {
+                'HOUSE': "House",
+                'GROUND FLOOR FLAT': "Flat",
+                'UPPER FLOOR FLAT': "Flat",
+                'MAISONETTE': "Maisonette",
+                'BUNGALOW': "Bungalow",
+                'WARDEN BUNGALOW': "Bungalow",
+                'WARDEN FLAT': "Flat",
+                'EXTRACARE SCHEME': "Flat",
+            }
+
        }
    }

+    outputs = {}
    for ha_name, data_assets in loader.data.items():
+
+        if not pull_data:
+            # Then we retrieve the data from S3
+            processed_ha_results = read_pickle_from_s3(
+                bucket_name="retrofit-datalake-dev",
+                s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+            )
+
+            outputs[ha_name] = {
+                "results_df": processed_ha_results["results_df"],
+                "scoring_data": processed_ha_results["scoring_df"],
+                "nodata": processed_ha_results["nodata"]
+            }
+            continue
+
        # For each HA, we read pull in the data required, and store in S3
        asset_list = data_assets["asset_list"].copy()

@ -490,8 +551,12 @@ def get_epc_data(
        # We iterate through the asset list and pull what we need
        results = []
        scoring_data = []
+        nodata = []
        for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):

+            if property_meta["matching_postcode"] is None:
+                continue
+
            if ha_name == "ha_1":
                property_type = property_meta["Asset Type"]
                # We correct a small error
@ -503,6 +568,9 @@ def get_epc_data(
                    property_type = "Flat"

                built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None)
+            elif ha_name == "ha_6":
+                property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]]
+                built_form = property_meta["built_form"]
            else:
                raise NotImplementedError("Implement me")

@ -517,6 +585,10 @@ def get_epc_data(
            searcher.ordnance_survey_client.built_form = built_form
            searcher.find_property(skip_os=True)

+            if searcher.newest_epc is None:
+                nodata.append(property_meta)
+                continue
+
            if searcher.newest_epc.get("estimated"):
                # We insert the row ID as our proxy for UPRN
                searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
@ -606,6 +678,7 @@ def get_epc_data(
                    "cavity_age": cavity_age,
                    **eligibility.walls,
                    **eligibility.roof,
+                    "is_estimated": searcher.newest_epc.get("estimated") is not None
                }
            )

@ -619,6 +692,10 @@ def get_epc_data(

        model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)

+        # scoring_df["is_community"].value_counts()
+        # scoring_df[scoring_df["is_community"] == "Unknown"]
+        # property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze()
+
        all_predictions = model_api.predict_all(
            df=scoring_df,
            bucket="retrofit-data-dev",
@ -678,8 +755,33 @@ def get_epc_data(
                }
            )

+        eligibility_assessment = pd.DataFrame(eligibility_assessment)

-def analyse_ha_data():
+        results_df = results_df.merge(
+            eligibility_assessment, how="left", on="row_id"
+        )
+
+        # We store the results in S3 as a pickle
+        save_pickle_to_s3(
+            data={
+                "results_df": results_df,
+                "scoring_data": scoring_df,
+                "nodata": nodata
+            },
+            bucket_name="retrofit-datalake-dev",
+            s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
+        )
+
+        outputs[ha_name] = {
+            "results_df": results_df,
+            "scoring_data": scoring_df,
+            "nodata": nodata
+        }
+
+    return outputs
+
+
+def analyse_ha_data(outputs, loader):
    """
    The approach we take within this function is the following:
    For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
@ -697,6 +799,127 @@ def analyse_ha_data():

    :return:
    """
+
+    for ha_name, datasets in outputs.items():
+
+        # TODO: This is a placeholder because we don't have the schemes that the properties have been qualified for
+        #       yet
+        #
+        import random
+        randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0])
+        inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes
+        inputs["asset_list"]["funding_scheme"] = None
+        inputs["asset_list"]["funding_scheme"] = np.where(
+            inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)",
+            inputs["asset_list"]["randomly_allocated_schemes"],
+            inputs["asset_list"]["funding_scheme"]
+        )
+
+        # End placeholder
+
+        results_df = datasets["results_df"].copy()
+
+        inputs = [x for k, x in loader.data.items() if k == ha_name][0]
+
+        analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
+            columns={"row_meaning": "asset_identification_status"}
+        ).merge(
+            results_df,
+            how="left",
+            right_on="row_id",
+            left_on="asset_list_row_id"
+        )
+
+        # If we have a survey list, we merge this onto the results
+
+        n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
+
+        properties_sold = (
+            inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
+            inputs["survey_list"] is not None else 0
+        )
+        properties_sold_eco4 = (
+            properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
+            properties_sold != 0 else 0
+        )
+        properties_sold_gbis = (
+            properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
+            properties_sold != 0 else 0
+        )
+
+        # We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
+        # remaining
+
+        if inputs["matched_lookup"] is not None:
+            analysis_data = analysis_data.merge(
+                inputs["matched_lookup"], how="left", on="asset_list_row_id"
+            )
+            # Drop any rows that have a survey_list_row_id
+            analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
+
+        # We now calculate the number of remaining properties, by scheme
+        # TODO: We might need to tweak a bit of the knowledge
+        remaining_properties = analysis_data[
+            analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
+            ]
+
+        remaining_properties_by_scheme = (
+            remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
+        )
+        remaining_properties_eco4 = remaining_properties_by_scheme[
+            remaining_properties_by_scheme["funding_scheme"] == "ECO4"
+            ]["asset_list_row_id"].values[0]
+
+        remaining_properties_gbis = remaining_properties_by_scheme[
+            remaining_properties_by_scheme["funding_scheme"] == "GBIS"
+            ]["asset_list_row_id"].values[0]
+
+        # For the remaining properties, we use the results of the eligibility process to classify the property into
+        # one of multiple categories
+        #
+        # For properties that have been identified as ECO4
+        # 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
+        #    - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
+        #      here and re-surveying is a common practice by Warmfront. Additionally, many of the social homes have
+        #      very old EPCs which may score lower when re-done
+        # 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
+        #    - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
+        #      actually look like after retrofit and so the EPC currently being a C or above means little, because
+        #      the updated EPC, showing an empty cavity, could bring the property within
+        # 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
+        #   - No SAP constraint, for the same reason as in category 2)
+        # 4) Does not look like ECO4 candidate
+        #
+        # For properties that have been identified as GBIS
+        # 1) Strict GBIS candidates
+        # 2) Properties that actually look like strict GBIS candidates
+        # 3) Subject to CIGA check - Filled cavity
+        # 4) Does not look like a GBIS candidate
+
+        # ECO4
+        # 1) We identify this if:
+        #   - remaining_properties["eco4_eligible"] == True
+        #   - remaining_properties[""]
+        remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
+        remaining_properties["eco4_message"].value_counts()
+        z = remaining_properties[
+            (remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
+            (remaining_properties["eco4_eligible"] == True)
+            ]
+
+        k = z[z["property_type"] == "Flat"]
+        k["uprn"]
+
+        ha_analysis_results = {
+            "n_properties_in_asset_list": n_properties_in_asset_list,
+            # ECO4
+            "properties_sold_eco4": properties_sold_eco4,
+            "remaining_properties_eco4": remaining_properties_eco4,
+            # GBIS
+            "properties_sold_gbis": properties_sold_gbis,
+            "remaining_properties_gbis": remaining_properties_gbis
+        }
+
    pass


@ -789,10 +1012,10 @@ def app():
    # Patch mainheatcont-description
    cleaned["mainheatcont-description"].extend(
        [
-            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False,
-             'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False,
-             'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False,
-             'rate_control': False}
+            {'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
+             'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
+             'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
+             'rate_control': None}
        ]
    )

@ -810,4 +1033,4 @@ def app():

    photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")

-    get_epc_data(loader)
+    outputs = get_epc_data(loader)
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@ -11,6 +11,37 @@ from recommendations.recommendation_utils import (
    get_wall_type
 )

+# Boolean columns whose missings are filled with False. TODO: produce this in the property change app and store in S3
+BOOLEAN_VARIABLES = [
+    'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
+    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
+    'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending',
+    'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid',
+    'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
+    'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
+    'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump',
+    'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
+    'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
+    'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration',
+    'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric',
+    'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
+    'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire',
+    'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending',
+    'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending',
+    'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending',
+    'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending',
+    'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending',
+    'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending',
+    'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending',
+    'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending',
+    'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending',
+    'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending',
+    'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending',
+    'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats',
+    'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network',
+    'is_community_ending', 'no_individual_heating_or_community_network_ending'
+]
+

 class BaseDataset:
    """
@ -439,7 +470,7 @@ class TrainingDataset(BaseDataset):

        for col in missings.index:
            unique_values = self.df[col].unique()
-            if True in unique_values or False in unique_values:
+            if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES):
                self.df[col] = self.df[col].fillna(False)
            if "none" in unique_values:
                self.df[col] = self.df[col].fillna("none")
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = {
    "",
    # We sometimes find None values - particulatly when we produce an estimated EPC
    None,
+    # An older value which rarely shows up but has been seen in the data.
+    "UNKNOWN",
 }

 DATA_ANOMALY_SUBSTRINGS = {