diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index b3f58b04..2a2cdfba 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -458,7 +458,7 @@ class SearchEpc: if not epc_data.empty: # Further processing of the EPC data - epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime']) + epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed') epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1) epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1)) epc_data["numeric_house_number"] = epc_data["house_number"].apply( @@ -646,7 +646,7 @@ class SearchEpc: return agg[key].values[0] - def find_property(self): + def find_property(self, skip_os=False): """ This method will attempt to identify a property. It will, at first, use the EPC api to try and find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to @@ -669,6 +669,9 @@ class SearchEpc: return # Step 2: If we don't have an EPC, we use the ordnance survey api to find the uprn + if skip_os: + return + os_response = self.ordnance_survey_client.get_places_api() if os_response["status"] != 200: diff --git a/etl/eligibility/ha_15_32/app.py b/etl/eligibility/ha_15_32/app.py index 48bfeb2c..9a563770 100644 --- a/etl/eligibility/ha_15_32/app.py +++ b/etl/eligibility/ha_15_32/app.py @@ -11,13 +11,12 @@ import numpy as np import msgpack from datetime import datetime, timedelta from utils.logger import setup_logger -from utils.s3 import read_from_s3 +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet from dotenv import load_dotenv from backend.SearchEpc import SearchEpc from backend.Property import Property from etl.eligibility.Eligibility import Eligibility from etl.epc.DataProcessor import DataProcessor -from backend.app.utils import read_parquet_from_s3 from backend.app.plan.utils import create_recommendation_scoring_data from etl.epc.settings import COLUMNS_TO_MERGE_ON from backend.ml_models.api import ModelApi @@ -348,14 +347,13 @@ def prepare_model_data_row( p = Property( id=property_id, postcode=modelling_epc["postcode"], - address1=modelling_epc["address1"], - epc_client=None, - data=modelling_epc + address=modelling_epc["address1"], + data=modelling_epc, + old_data=old_data, + full_sap_epc=full_sap_epc ) - p.old_data = old_data - p.full_sap_epc = full_sap_epc - p.get_components(cleaned) + p.get_components(cleaned, None, None) # This is temp - this should happen after scoring cleaned_property_data = DataProcessor.apply_averages_cleaning( data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]), @@ -1087,7 +1085,7 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) - cleaning_data = read_parquet_from_s3( + cleaning_data = read_dataframe_from_s3_parquet( bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", ) diff --git a/etl/eligibility/ha_15_32/ha7_app.py b/etl/eligibility/ha_15_32/ha7_app.py index 7d856366..0152ab91 100644 --- a/etl/eligibility/ha_15_32/ha7_app.py +++ b/etl/eligibility/ha_15_32/ha7_app.py @@ -1,3 +1,4 @@ +import os import msgpack import openpyxl from openpyxl.styles.colors import COLOR_INDEX @@ -5,10 +6,9 @@ from pathlib import Path from datetime import datetime import pandas as pd import numpy as np -from utils.s3 import read_from_s3 +from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet from utils.logger import setup_logger from dotenv import load_dotenv -from backend.app.utils import read_parquet_from_s3 from tqdm import tqdm from backend.SearchEpc import SearchEpc from etl.eligibility.Eligibility import Eligibility @@ -17,13 +17,14 @@ from etl.epc.DataProcessor import DataProcessor from etl.epc.settings import COLUMNS_TO_MERGE_ON from backend.ml_models.api import ModelApi -import re - ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env" logger = setup_logger() load_dotenv(ENV_FILE) +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") +OS_API_KEY = os.getenv("ORDNANCE_SURVEY_API_KEY") + def load_data(): """ @@ -79,20 +80,27 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at): nodata = [] for _, house in tqdm(data.iterrows(), total=len(data)): + if house["Address"] is not None: + address = house["Address"] + else: + address = house["Address2"] + searcher = SearchEpc( - address1=house["Address"], - postcode=house["Postcode"] + address1=address, + postcode=house["Postcode"], + auth_token=EPC_AUTH_TOKEN, + os_api_key=None ) - response = searcher.search() - if response["status"] == 204: - nodata.append(house) + searcher.find_property(skip_os=True) + + if searcher.newest_epc is None: + nodata.append(house["row_id"]) continue - newest_epc, older_epcs, full_sap_epc = searcher.retrieve( - property_type=property_type_lookup.get(house["Property Type"], None), - address=house["Address"], - ) + newest_epc = searcher.newest_epc + older_epcs = searcher.older_epcs + full_sap_epc = searcher.full_sap_epc eligibility = Eligibility(epc=newest_epc, cleaned=cleaned) eligibility.check_gbis_warmfront() @@ -273,7 +281,7 @@ def app(): ) cleaned = msgpack.unpackb(cleaned, raw=False) - cleaning_data = read_parquet_from_s3( + cleaning_data = read_dataframe_from_s3_parquet( bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet", ) diff --git a/etl/testing_data/estimate_epc.py b/etl/testing_data/estimate_epc.py index 9e460678..cd91a540 100644 --- a/etl/testing_data/estimate_epc.py +++ b/etl/testing_data/estimate_epc.py @@ -73,7 +73,9 @@ def app(): df["UPRN"] = df["UPRN"].astype("Int64").astype("str") df = df[~pd.isnull(df["UPRN"])] - uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE) + # uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE) + # Take a fixed sample based on the first DIR_SAMPLE_SIZE uprns + uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE] df_sample = df[df["UPRN"].isin(uprn_sample)] # Take the record with the newest LODGEMENT_DATETIME by uprn df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN") @@ -149,6 +151,8 @@ def app(): # 0.7859617377809409 # 0.5348837209302325 + # Fixed sample, sqrt weights + # Group by tenure by_tenure = results_df.groupby("tenure").agg( {"numeric_success": "median", "categorical_success": "median", "uprn": "count"}