Merge pull request #287 from Hestia-Homes/ha-analysis-3

Ha analysis 3
2026-07-27 23:35:01 +00:00 · 2024-03-26 18:05:46 +00:00 · 2024-03-26 18:05:46 +00:00 · c81b03c458
commit c81b03c458
parent 444c7c22c3 724379a86d
13 changed files with 6431 additions and 887 deletions
--- a/.idea/.gitignore
+++ b/.idea/.gitignore
@ -1,3 +1,5 @@
 # Default ignored files
 /shelf/
 /workspace.xml
+# GitHub Copilot persisted chat sessions
+/copilot/chatSessions
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@ -7,7 +7,7 @@
      <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
      <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
    </content>
-    <orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
  <component name="PyNamespacePackagesService">
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@ -3,7 +3,7 @@
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (backend)" />
  </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
  <component name="PythonCompatibilityInspectionAdvertiser">
    <option name="version" value="3" />
  </component>
--- a/backend/Property.py
+++ b/backend/Property.py
@ -147,7 +147,8 @@ class Property:
        # self.base_difference_record.df

    def adjust_difference_record_with_recommendations(
-        self, property_recommendations,
+        self,
+        property_recommendations,
        property_representative_recommendations
    ):
        """
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -30,7 +30,7 @@ vartypes = {
    'environment-impact-potential': "Int64",
    'glazed-type': 'str',
    'heating-cost-current': 'float',
-    'address3': 'str',
+    # 'address3': 'str',
    'mainheatcont-description': 'str',
    'sheating-energy-eff': 'str',
    'property-type': 'str',
@ -40,7 +40,7 @@ vartypes = {
    'mechanical-ventilation': 'str',
    'hot-water-cost-current': 'str',
    'county': 'str',
-    'postcode': 'str',
+    # 'postcode': 'str',
    'solar-water-heating-flag': 'str',
    'constituency': 'str',
    'co2-emissions-potential': 'float',
@ -55,7 +55,7 @@ vartypes = {
    # 'inspection-date': str,
    'mains-gas-flag': 'str',
    'co2-emiss-curr-per-floor-area': 'float',
-    'address1': 'str',
+    # 'address1': 'str',
    'heat-loss-corridor': 'str',
    'flat-storey-count': "Int64",
    'constituency-label': 'str',
@ -67,7 +67,7 @@ vartypes = {
    'roof-description': 'str',
    'floor-energy-eff': 'str',
    'number-habitable-rooms': 'float',
-    'address2': 'str',
+    # 'address2': 'str',
    'hot-water-env-eff': 'str',
    'posttown': 'str',
    'mainheatc-energy-eff': 'str',
@ -98,7 +98,7 @@ vartypes = {
    # 'lodgement-date',
    'extension-count': "Int64",
    'mainheatc-env-eff': 'str',
-    'lmk-key': 'str',
+    # 'lmk-key': 'str',
    'wind-turbine-count': "Int64",
    'tenure': 'str',
    'floor-level': 'str',
@ -147,6 +147,7 @@ class SearchEpc:
        uprn: [int, None] = None,
        size=None,
        property_type=None,
+        fast=False
    ):
        """
        Address lines 1 and postcode are mandatory fields. The other address lines are optional
@ -187,6 +188,7 @@ class SearchEpc:
        self.size = size if size is not None else 25

        self.property_type = property_type
+        self.fast = fast

    @classmethod
    def get_house_number(cls, address: str) -> str | None:
@ -365,9 +367,6 @@ class SearchEpc:
        # Finally, we identify the newest epc and the rest, and then return
        newest_epc, older_epcs = self.filter_newest_epc(list_of_epcs=rows)

-        # Retrieve postcode and address
-        address_epc, postcode_epc = self.format_address(newest_epc=newest_epc)
-
        # Get the uprn from the newest record for this home
        uprns = {r["uprn"] for r in rows if r["uprn"]}
        # We can sometimes have no uprn for a property
@ -384,6 +383,12 @@ class SearchEpc:

        uprn = uprns.pop() if uprns else None

+        if self.fast:
+            return newest_epc, [], {}, "", "", None
+
+        # Retrieve postcode and address
+        address_epc, postcode_epc = self.format_address(newest_epc=newest_epc)
+
        return newest_epc, older_epcs, full_sap_epc, address_epc, postcode_epc, uprn

    @staticmethod
@ -575,6 +580,11 @@ class SearchEpc:
            property_type=property_type
        )

+        # If the lodgement datetime is missing, we fill it with the inspection-date
+        epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["inspection-date"])
+        # If we still have missing dates, we set them to the mean of the non-NA dates
+        epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["lodgement-datetime"].mean())
+
        # For each attribute, we need to determine the datatype and use an appropriate method
        # to estimate.
        estimated_epc = {}
@ -609,7 +619,11 @@ class SearchEpc:
        # Insert an estimated lodgement datetime, with a weighted average
        estimated_epc["lodgement-datetime"] = self.calculate_weighted_lodgement_datetime(epc_data=epc_data)
        # Extract lodgement date
-        estimated_epc["lodgement-date"] = estimated_epc["lodgement-datetime"].strftime("%Y-%m-%d")
+        # It is possible that there is still no lodgement date, so we need to handle this
+        if pd.isnull(estimated_epc["lodgement-datetime"]):
+            estimated_epc["lodgement-date"] = None
+        else:
+            estimated_epc["lodgement-date"] = estimated_epc["lodgement-datetime"].strftime("%Y-%m-%d")

        estimated_epc["postcode"] = self.postcode
        estimated_epc["uprn"] = self.uprn
--- a/etl/eligibility/Eligibility.py
+++ b/etl/eligibility/Eligibility.py
@ -145,6 +145,7 @@ class Eligibility:
                "reason": None,
                "thickness_classification": thickness_classification
            }
+            return

        # Insulation is already thick enough
        self.loft = {
@ -164,8 +165,10 @@ class Eligibility:
        """

        is_cavity = self.walls["is_cavity_wall"]
-        is_empty = (not self.walls["is_filled_cavity"]) or (
+        is_empty = (not self.walls["is_filled_cavity"])
+        is_as_built = (
            self.walls["is_as_built"] and self.walls["insulation_thickness"] not in ["average", "above average"]
+            and self.walls["is_assumed"]
        )
        is_partial_filled = "partial" in self.walls["clean_description"].lower()
        # We look for potentially under performing cavities - anything that is assumed, as built and insulated
@ -175,6 +178,7 @@ class Eligibility:

        is_unfilled_cavity = is_cavity and (is_empty and not is_partial_filled)
        is_partial_filled_cavity = is_cavity and is_partial_filled
+        is_assumed_filled_cavity = is_cavity and is_as_built
        is_underperforming_cavity = is_cavity and is_underperforming

        # Check if it has internal or external wall insulation
@ -195,6 +199,13 @@ class Eligibility:
            }
            return

+        if is_assumed_filled_cavity:
+            self.cavity = {
+                "suitability": True,
+                "type": "as built assumed",
+            }
+            return
+
        if is_partial_filled_cavity:
            self.cavity = {
                "suitability": True,
@ -340,13 +351,35 @@ class Eligibility:

        # Check if the property is suitable for cavity wall
        self.cavity_insulation()
-        self.loft_insulation()

-        self.gbis_warmfront = (self.cavity["suitability"]) and (
-            int(self.epc["current-energy-efficiency"]) <= 68
-        )
+        current_sap = int(self.epc["current-energy-efficiency"])
+        # We have a strict suitability check and a non-strict check

-    def check_eco4_warmfront(self, post_retrofit_sap=None):
+        # Perfect strictness
+        if (self.cavity["type"] == "empty") and (current_sap < 69):
+            self.gbis_warmfront = {
+                "eligible": True,
+                "strict": True,
+                "message": "Perfect suitability",
+            }
+            return
+
+        # Near perfect
+        if self.cavity["suitability"] and (current_sap < 69):
+            self.gbis_warmfront = {
+                "eligible": True,
+                "strict": True,
+                "message": "Near perfect suitability",
+            }
+            return
+
+        self.gbis_warmfront = {
+            "eligible": False,
+            "strict": False,
+            "message": "All conditions fail",
+        }
+
+    def check_eco4_warmfront(self):
        """
        This function will check if the property is eligible for funding under the ECO4 scheme

@ -378,49 +411,121 @@ class Eligibility:
        self.cavity_insulation()
        self.loft_insulation()

-        # make sure conditions 2 and 3 are true
-        is_eligible = self.cavity["suitability"] & self.loft["suitability"]
+        # We put in a placeholder when the roof is not a loft
+        if self.loft["reason"] == "roof not loft":
+            self.loft["thickness"] = 999

-        if current_sap >= 69:
+        # Case 1 - no conditions are met
+        if not self.cavity["suitability"] and (self.loft["thickness"] > 100) and current_sap >= 55:
            self.eco4_warmfront = {
                "eligible": False,
-                "message": "sap too high",
+                "strict": False,
+                "message": "All conditions fail",
                "cavity_type": self.cavity["type"],
                "loft_type": self.loft["thickness_classification"]
            }
            return

-        if post_retrofit_sap is None:
-
-            if current_sap >= 55:
-                message = "Possibly eligible but property currently EPC D"
-            else:
-                message = "subject to post retrofit sap" if is_eligible else "not eligible"
-
-            # Update the message to flag properties that failed just because of a full cavity.
-            # We need to double check that the wall is a cavity, that the loft is suitable and that the
-            # sap is within reason
-            # We can then estimate the age of the cavity fill
-            if not is_eligible and (current_sap < 69) and self.loft["suitability"] and self.walls["is_cavity_wall"]:
-                message = "Failed due to full cavity - check cavity age"
-
+        # Case 2 - perfect match
+        if (self.cavity["type"] == "empty") and (self.loft["thickness"] <= 100) and (current_sap < 55):
            self.eco4_warmfront = {
-                "eligible": is_eligible,
-                "message": message,
+                "eligible": True,
+                "strict": True,
+                "message": "Perfect suitability",
                "cavity_type": self.cavity["type"],
                "loft_type": self.loft["thickness_classification"]
            }
            return

-        is_eligible = is_eligible & (post_retrofit_sap >= 69)
+        # Case 2.5 - near perfect match - but we would not recommend this using the model
+        if self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap < 55):
+            self.eco4_warmfront = {
+                "eligible": True,
+                "strict": True,
+                "message": "Near perfect suitability",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return

-        self.eco4_warmfront = {
-            "eligible": is_eligible,
-            "message": None,
-            "cavity_type": self.cavity["type"],
-            "loft_type": self.loft["thickness_classification"]
-        }
-        return
+        # Case 3 - cavity is suitable, loft is within 150mm, sap is good
+        if self.cavity["suitability"] and (self.loft["thickness"] <= 150) and (current_sap < 55):
+            self.eco4_warmfront = {
+                "eligible": True,
+                "strict": False,
+                "message": "Meets cavity, loft borderline, meets sap",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 3.5 - cavity is suitable, loft is over 150mm, sap is good
+        if self.cavity["suitability"] and (self.loft["thickness"] > 150) and (current_sap < 55):
+            self.eco4_warmfront = {
+                "eligible": True,
+                "strict": False,
+                "message": "Meets cavity and sap",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 4 - cavity is not suitable, loft is, sap is good - we say this is not eligible
+        if not self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap < 55):
+            self.eco4_warmfront = {
+                "eligible": False,
+                "strict": False,
+                "message": "failed fabric check",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 5 - cavity and loft suitable, sap too high
+        if self.cavity["suitability"] and (self.loft["thickness"] <= 150) and (current_sap >= 55):
+            self.eco4_warmfront = {
+                "eligible": True,
+                "strict": False,
+                "message": "Meets fabric, fails SAP check",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 6 - meets just cavity
+        if self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap >= 55):
+            self.eco4_warmfront = {
+                "eligible": True,
+                "strict": False,
+                "message": "Meets just cavity",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 7 - fails cavity, loft but meets sap
+        if not self.cavity["suitability"] and (self.loft["thickness"] > 100) and (current_sap < 55):
+            self.eco4_warmfront = {
+                "eligible": False,
+                "strict": False,
+                "message": "Fails cavity and loft, meets SAP",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        # Case 8 - fails cavity, meets loft, fails sap
+        if not self.cavity["suitability"] and (self.loft["thickness"] <= 100) and (current_sap >= 55):
+            self.eco4_warmfront = {
+                "eligible": False,
+                "strict": False,
+                "message": "Fails cavity, meets loft, fails SAP",
+                "cavity_type": self.cavity["type"],
+                "loft_type": self.loft["thickness_classification"]
+            }
+            return
+
+        raise ValueError("Implement me")

    def check_gbis(self):

--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@ -387,17 +387,19 @@ def prepare_model_data_row(
    }

    simulations = [
-        [cavity_simulation],
-        [loft_simulation]
+        cavity_simulation,
+        loft_simulation
    ]

-    p.adjust_difference_record_with_recommendations(simulations)
+    recommendation_record = p.base_difference_record.df.to_dict("records")[0].copy()
+    scoring_dict = p.create_recommendation_scoring_data(
+        property_id=p.id,
+        recommendation_record=recommendation_record,
+        recommendations=simulations,
+        primary_recommendation_id=cavity_simulation["recommendation_id"]
+    )

-    # Make sure we definitely have the correct data
-    cavity_scoring = [x for x in p.recommendations_scoring_data if "cavity" in x["id"]][0]
-    loft_scoring = [x for x in p.recommendations_scoring_data if "loft" in x["id"]][0]
-
-    return [cavity_scoring, loft_scoring]
+    return [scoring_dict]


 def get_ha_32data(ha_data, cleaned, cleaning_data, created_at):
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@ -203,11 +203,11 @@ class TrainingDataset(BaseDataset):
        common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]

        self.df = self.df.loc[
-            :,
-            no_suffix_cols
-            + only_ending_cols
-            + [col for cols in common_cols for col in cols],
-        ]
+                  :,
+                  no_suffix_cols
+                  + only_ending_cols
+                  + [col for cols in common_cols for col in cols],
+                  ]

    def _remove_abnormal_change_in_floor_area(self):
        """
@ -509,7 +509,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["is_sandstone_or_limestone"]
                    == expanded_df["is_sandstone_or_limestone_ending"]
                )
-            ]
+                ]
        elif component == "floor":
            expanded_df = expanded_df[
                (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
@ -526,7 +526,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["is_to_external_air"]
                    == expanded_df["is_to_external_air_ending"]
                )
-            ]
+                ]
        elif component == "roof":
            expanded_df = expanded_df[
                (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
@ -539,7 +539,7 @@ class TrainingDataset(BaseDataset):
                    expanded_df["has_dwelling_above"]
                    == expanded_df["has_dwelling_above_ending"]
                )
-            ]
+                ]

        return expanded_df

--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@ -725,26 +725,26 @@ class EPCRecord:
        if self.prepared_epc["construction-age-band"] in DATA_ANOMALY_MATCHES:
            if self.old_data:
                # Take the most recent
-                max_datetime = max(
-                    [
-                        old_record["lodgement-datetime"]
-                        for old_record in self.old_data
-                        if old_record["construction-age-band"]
-                           not in DATA_ANOMALY_MATCHES
-                    ]
-                )
-
-                most_recent = [
-                    old_record
+                old_age_bands = [
+                    old_record["lodgement-datetime"]
                    for old_record in self.old_data
-                    if old_record["lodgement-datetime"] == max_datetime
+                    if old_record["construction-age-band"] not in DATA_ANOMALY_MATCHES
                ]

-                self.prepared_epc["construction-age-band"] = (
-                    EPCDataProcessor.clean_construction_age_band(
-                        most_recent[0]["construction-age-band"]
+                if old_age_bands:
+                    max_datetime = max(old_age_bands)
+
+                    most_recent = [
+                        old_record
+                        for old_record in self.old_data
+                        if old_record["lodgement-datetime"] == max_datetime
+                    ]
+
+                    self.prepared_epc["construction-age-band"] = (
+                        EPCDataProcessor.clean_construction_age_band(
+                            most_recent[0]["construction-age-band"]
+                        )
                    )
-                )

        self.construction_age_band = self.prepared_epc["construction-age-band"]
        self.age_band = england_wales_age_band_lookup.get(self.construction_age_band)
--- a/etl/epc_clean/app.py
+++ b/etl/epc_clean/app.py
@ -36,8 +36,11 @@ def app():
    cleaned_data = {}
    epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]

+    WALLS = []
    for directory in tqdm(epc_directories):
        data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+        z = data["WALLS_DESCRIPTION"].unique().tolist()
+        WALLS.extend(z)
        # Rename the columns to the same format as the api returns
        data.columns = [c.replace("_", "-").lower() for c in data.columns]
        # Take just date before the date threshold
--- a/etl/epc_clean/epc_attributes/RoofAttributes.py
+++ b/etl/epc_clean/epc_attributes/RoofAttributes.py
@ -122,6 +122,13 @@ class RoofAttributes(Definitions):
        result["is_valid"] = "invalid" not in description
        description = description.replace("invalid", "")

+        # We handle an edge case where the description is "pitched, 150  loft insulation" and is missing the mm
+        if result["is_pitched"] or result["is_loft"]:
+            # Search for a regular expression that matches 150   insulation
+            match = re.search(r"(\d+\+?)\s*insulation", description)
+            if match:
+                result['insulation_thickness'] = match.group(1)
+
        # insulation thickness
        thickness_map = {
            "ceiling insulated": "average",
@ -137,11 +144,11 @@ class RoofAttributes(Definitions):
                # Remove the match from the description
                # description = description.replace(key, "")
                break
-        else:
-            # Extract insulation thickness in mm, if present
-            match = re.search(r'(\d+\+?)\s*mm', description)
-            if match:
-                result['insulation_thickness'] = match.group(1)
+
+        # Extract insulation thickness in mm, if present
+        match = re.search(r'(\d+\+?)\s*mm', description)
+        if match:
+            result['insulation_thickness'] = match.group(1)

        if "insulation_thickness" not in result:
            result['insulation_thickness'] = None
--- a/utils/s3.py
+++ b/utils/s3.py
@ -184,7 +184,7 @@ def read_pickle_from_s3(bucket_name, s3_file_name):
        logger.errpr("Incomplete credentials provided.")
        return None
    except Exception as e:
-        logger.errpr(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}')
+        logger.error(f'Failed to download data from {bucket_name}/{s3_file_name}: {str(e)}')
        return None

    # Deserialize data from pickle format