Make fixes to the eligibility pipeline, with updates to the Property class

This commit is contained in:
Khalim Conn-Kowlessar 2024-01-06 17:59:03 +00:00
parent 9c94123366
commit ac556d5507
4 changed files with 39 additions and 26 deletions

View file

@ -458,7 +458,7 @@ class SearchEpc:
if not epc_data.empty:
# Further processing of the EPC data
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'])
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed')
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
epc_data["numeric_house_number"] = epc_data["house_number"].apply(
@ -646,7 +646,7 @@ class SearchEpc:
return agg[key].values[0]
def find_property(self):
def find_property(self, skip_os=False):
"""
This method will attempt to identify a property. It will, at first, use the EPC api to try and
find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to
@ -669,6 +669,9 @@ class SearchEpc:
return
# Step 2: If we don't have an EPC, we use the Ordnance Survey API to find the UPRN
if skip_os:
return
os_response = self.ordnance_survey_client.get_places_api()
if os_response["status"] != 200:

View file

@ -11,13 +11,12 @@ import numpy as np
import msgpack
from datetime import datetime, timedelta
from utils.logger import setup_logger
from utils.s3 import read_from_s3
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from backend.Property import Property
from etl.eligibility.Eligibility import Eligibility
from etl.epc.DataProcessor import DataProcessor
from backend.app.utils import read_parquet_from_s3
from backend.app.plan.utils import create_recommendation_scoring_data
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
@ -348,14 +347,13 @@ def prepare_model_data_row(
p = Property(
id=property_id,
postcode=modelling_epc["postcode"],
address1=modelling_epc["address1"],
epc_client=None,
data=modelling_epc
address=modelling_epc["address1"],
data=modelling_epc,
old_data=old_data,
full_sap_epc=full_sap_epc
)
p.old_data = old_data
p.full_sap_epc = full_sap_epc
p.get_components(cleaned)
p.get_components(cleaned, None, None)
# This is temporary - this should happen after scoring
cleaned_property_data = DataProcessor.apply_averages_cleaning(
data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]),
@ -1087,7 +1085,7 @@ def app():
)
cleaned = msgpack.unpackb(cleaned, raw=False)
cleaning_data = read_parquet_from_s3(
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
)

View file

@ -1,3 +1,4 @@
import os
import msgpack
import openpyxl
from openpyxl.styles.colors import COLOR_INDEX
@ -5,10 +6,9 @@ from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
from utils.s3 import read_from_s3
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
from utils.logger import setup_logger
from dotenv import load_dotenv
from backend.app.utils import read_parquet_from_s3
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
@ -17,13 +17,14 @@ from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
import re
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
logger = setup_logger()
load_dotenv(ENV_FILE)
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
OS_API_KEY = os.getenv("ORDNANCE_SURVEY_API_KEY")
def load_data():
"""
@ -79,20 +80,27 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at):
nodata = []
for _, house in tqdm(data.iterrows(), total=len(data)):
if house["Address"] is not None:
address = house["Address"]
else:
address = house["Address2"]
searcher = SearchEpc(
address1=house["Address"],
postcode=house["Postcode"]
address1=address,
postcode=house["Postcode"],
auth_token=EPC_AUTH_TOKEN,
os_api_key=None
)
response = searcher.search()
if response["status"] == 204:
nodata.append(house)
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
nodata.append(house["row_id"])
continue
newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
property_type=property_type_lookup.get(house["Property Type"], None),
address=house["Address"],
)
newest_epc = searcher.newest_epc
older_epcs = searcher.older_epcs
full_sap_epc = searcher.full_sap_epc
eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
eligibility.check_gbis_warmfront()
@ -273,7 +281,7 @@ def app():
)
cleaned = msgpack.unpackb(cleaned, raw=False)
cleaning_data = read_parquet_from_s3(
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
)

View file

@ -73,7 +73,9 @@ def app():
df["UPRN"] = df["UPRN"].astype("Int64").astype("str")
df = df[~pd.isnull(df["UPRN"])]
uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
# uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
# Take a fixed, reproducible sample: the first DIR_SAMPLE_SIZE UPRNs in sorted order
uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE]
df_sample = df[df["UPRN"].isin(uprn_sample)]
# Take the record with the newest LODGEMENT_DATETIME for each UPRN
df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
@ -149,6 +151,8 @@ def app():
# 0.7859617377809409
# 0.5348837209302325
# Fixed sample, sqrt weights
# Group by tenure
by_tenure = results_df.groupby("tenure").agg(
{"numeric_success": "median", "categorical_success": "median", "uprn": "count"}