Fix eligibility pipeline to match the updated Property class interface

2026-07-27 23:35:01 +00:00 · 2024-01-06 17:59:03 +00:00 · 2024-01-06 17:59:03 +00:00 · ac556d5507
commit ac556d5507
parent 9c94123366
4 changed files with 39 additions and 26 deletions
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@ -458,7 +458,7 @@ class SearchEpc:

                if not epc_data.empty:
                    # Further processing of the EPC data
-                    epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'])
+                    epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed')
                    epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
                    epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
                    epc_data["numeric_house_number"] = epc_data["house_number"].apply(
@ -646,7 +646,7 @@ class SearchEpc:

        return agg[key].values[0]

-    def find_property(self):
+    def find_property(self, skip_os=False):
        """
        This method will attempt to identify a property. It will, at first, use the EPC api to try and
        find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to
@ -669,6 +669,9 @@ class SearchEpc:
            return

        # Step 2: If we don't have an EPC, we use the ordnance survey api to find the uprn
+        if skip_os:
+            return
+
        os_response = self.ordnance_survey_client.get_places_api()

        if os_response["status"] != 200:
--- a/etl/eligibility/ha_15_32/app.py
+++ b/etl/eligibility/ha_15_32/app.py
@ -11,13 +11,12 @@ import numpy as np
 import msgpack
 from datetime import datetime, timedelta
 from utils.logger import setup_logger
-from utils.s3 import read_from_s3
+from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
 from dotenv import load_dotenv
 from backend.SearchEpc import SearchEpc
 from backend.Property import Property
 from etl.eligibility.Eligibility import Eligibility
 from etl.epc.DataProcessor import DataProcessor
-from backend.app.utils import read_parquet_from_s3
 from backend.app.plan.utils import create_recommendation_scoring_data
 from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from backend.ml_models.api import ModelApi
@ -348,14 +347,13 @@ def prepare_model_data_row(
    p = Property(
        id=property_id,
        postcode=modelling_epc["postcode"],
-        address1=modelling_epc["address1"],
-        epc_client=None,
-        data=modelling_epc
+        address=modelling_epc["address1"],
+        data=modelling_epc,
+        old_data=old_data,
+        full_sap_epc=full_sap_epc
    )
-    p.old_data = old_data
-    p.full_sap_epc = full_sap_epc

-    p.get_components(cleaned)
+    p.get_components(cleaned, None, None)
    # This is temp - this should happen after scoring
    cleaned_property_data = DataProcessor.apply_averages_cleaning(
        data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]),
@ -1087,7 +1085,7 @@ def app():
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

-    cleaning_data = read_parquet_from_s3(
+    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
    )

--- a/etl/eligibility/ha_15_32/ha7_app.py
+++ b/etl/eligibility/ha_15_32/ha7_app.py
@ -1,3 +1,4 @@
+import os
 import msgpack
 import openpyxl
 from openpyxl.styles.colors import COLOR_INDEX
@ -5,10 +6,9 @@ from pathlib import Path
 from datetime import datetime
 import pandas as pd
 import numpy as np
-from utils.s3 import read_from_s3
+from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
 from utils.logger import setup_logger
 from dotenv import load_dotenv
-from backend.app.utils import read_parquet_from_s3
 from tqdm import tqdm
 from backend.SearchEpc import SearchEpc
 from etl.eligibility.Eligibility import Eligibility
@ -17,13 +17,14 @@ from etl.epc.DataProcessor import DataProcessor
 from etl.epc.settings import COLUMNS_TO_MERGE_ON
 from backend.ml_models.api import ModelApi

-import re
-
 ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"

 logger = setup_logger()
 load_dotenv(ENV_FILE)

+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+OS_API_KEY = os.getenv("ORDNANCE_SURVEY_API_KEY")
+

 def load_data():
    """
@ -79,20 +80,27 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at):
    nodata = []
    for _, house in tqdm(data.iterrows(), total=len(data)):

+        if house["Address"] is not None:
+            address = house["Address"]
+        else:
+            address = house["Address2"]
+
        searcher = SearchEpc(
-            address1=house["Address"],
-            postcode=house["Postcode"]
+            address1=address,
+            postcode=house["Postcode"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key=None
        )

-        response = searcher.search()
-        if response["status"] == 204:
-            nodata.append(house)
+        searcher.find_property(skip_os=True)
+
+        if searcher.newest_epc is None:
+            nodata.append(house["row_id"])
            continue

-        newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
-            property_type=property_type_lookup.get(house["Property Type"], None),
-            address=house["Address"],
-        )
+        newest_epc = searcher.newest_epc
+        older_epcs = searcher.older_epcs
+        full_sap_epc = searcher.full_sap_epc

        eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
        eligibility.check_gbis_warmfront()
@ -273,7 +281,7 @@ def app():
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

-    cleaning_data = read_parquet_from_s3(
+    cleaning_data = read_dataframe_from_s3_parquet(
        bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
    )

--- a/etl/testing_data/estimate_epc.py
+++ b/etl/testing_data/estimate_epc.py
@ -73,7 +73,9 @@ def app():
        df["UPRN"] = df["UPRN"].astype("Int64").astype("str")
        df = df[~pd.isnull(df["UPRN"])]

-        uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
+        # uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
+        # Take a fixed sample based on the first DIR_SAMPLE_SIZE uprns
+        uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE]
        df_sample = df[df["UPRN"].isin(uprn_sample)]
        # Take the record with the newest LODGEMENT_DATETIME by uprn
        df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
@ -149,6 +151,8 @@ def app():
    # 0.7859617377809409
    # 0.5348837209302325

+    # Fixed sample, sqrt weights
+
    # Group by tenure
    by_tenure = results_df.groupby("tenure").agg(
        {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}