mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Make fixes to the eligibility pipeline, with updates to the Property class
This commit is contained in:
parent
9c94123366
commit
ac556d5507
4 changed files with 39 additions and 26 deletions
|
|
@ -458,7 +458,7 @@ class SearchEpc:
|
|||
|
||||
if not epc_data.empty:
|
||||
# Further processing of the EPC data
|
||||
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'])
|
||||
epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], format='mixed')
|
||||
epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
|
||||
epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
|
||||
epc_data["numeric_house_number"] = epc_data["house_number"].apply(
|
||||
|
|
@ -646,7 +646,7 @@ class SearchEpc:
|
|||
|
||||
return agg[key].values[0]
|
||||
|
||||
def find_property(self):
|
||||
def find_property(self, skip_os=False):
|
||||
"""
|
||||
This method will attempt to identify a property. It will first use the EPC API to try and
|
||||
find the EPC for the property and the associated UPRN. If this fails, it will use the Ordnance Survey API to
|
||||
|
|
@ -669,6 +669,9 @@ class SearchEpc:
|
|||
return
|
||||
|
||||
# Step 2: If we don't have an EPC, we use the Ordnance Survey API to find the UPRN
|
||||
if skip_os:
|
||||
return
|
||||
|
||||
os_response = self.ordnance_survey_client.get_places_api()
|
||||
|
||||
if os_response["status"] != 200:
|
||||
|
|
|
|||
|
|
@ -11,13 +11,12 @@ import numpy as np
|
|||
import msgpack
|
||||
from datetime import datetime, timedelta
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_from_s3
|
||||
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
|
||||
from dotenv import load_dotenv
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from backend.Property import Property
|
||||
from etl.eligibility.Eligibility import Eligibility
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from backend.app.utils import read_parquet_from_s3
|
||||
from backend.app.plan.utils import create_recommendation_scoring_data
|
||||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from backend.ml_models.api import ModelApi
|
||||
|
|
@ -348,14 +347,13 @@ def prepare_model_data_row(
|
|||
p = Property(
|
||||
id=property_id,
|
||||
postcode=modelling_epc["postcode"],
|
||||
address1=modelling_epc["address1"],
|
||||
epc_client=None,
|
||||
data=modelling_epc
|
||||
address=modelling_epc["address1"],
|
||||
data=modelling_epc,
|
||||
old_data=old_data,
|
||||
full_sap_epc=full_sap_epc
|
||||
)
|
||||
p.old_data = old_data
|
||||
p.full_sap_epc = full_sap_epc
|
||||
|
||||
p.get_components(cleaned)
|
||||
p.get_components(cleaned, None, None)
|
||||
# This is temporary - this cleaning step should happen after scoring
|
||||
cleaned_property_data = DataProcessor.apply_averages_cleaning(
|
||||
data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]),
|
||||
|
|
@ -1087,7 +1085,7 @@ def app():
|
|||
)
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
cleaning_data = read_parquet_from_s3(
|
||||
cleaning_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import os
|
||||
import msgpack
|
||||
import openpyxl
|
||||
from openpyxl.styles.colors import COLOR_INDEX
|
||||
|
|
@ -5,10 +6,9 @@ from pathlib import Path
|
|||
from datetime import datetime
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from utils.s3 import read_from_s3
|
||||
from utils.s3 import read_from_s3, read_dataframe_from_s3_parquet
|
||||
from utils.logger import setup_logger
|
||||
from dotenv import load_dotenv
|
||||
from backend.app.utils import read_parquet_from_s3
|
||||
from tqdm import tqdm
|
||||
from backend.SearchEpc import SearchEpc
|
||||
from etl.eligibility.Eligibility import Eligibility
|
||||
|
|
@ -17,13 +17,14 @@ from etl.epc.DataProcessor import DataProcessor
|
|||
from etl.epc.settings import COLUMNS_TO_MERGE_ON
|
||||
from backend.ml_models.api import ModelApi
|
||||
|
||||
import re
|
||||
|
||||
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
|
||||
|
||||
logger = setup_logger()
|
||||
load_dotenv(ENV_FILE)
|
||||
|
||||
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
|
||||
OS_API_KEY = os.getenv("ORDNANCE_SURVEY_API_KEY")
|
||||
|
||||
|
||||
def load_data():
|
||||
"""
|
||||
|
|
@ -79,20 +80,27 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at):
|
|||
nodata = []
|
||||
for _, house in tqdm(data.iterrows(), total=len(data)):
|
||||
|
||||
if house["Address"] is not None:
|
||||
address = house["Address"]
|
||||
else:
|
||||
address = house["Address2"]
|
||||
|
||||
searcher = SearchEpc(
|
||||
address1=house["Address"],
|
||||
postcode=house["Postcode"]
|
||||
address1=address,
|
||||
postcode=house["Postcode"],
|
||||
auth_token=EPC_AUTH_TOKEN,
|
||||
os_api_key=None
|
||||
)
|
||||
|
||||
response = searcher.search()
|
||||
if response["status"] == 204:
|
||||
nodata.append(house)
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
if searcher.newest_epc is None:
|
||||
nodata.append(house["row_id"])
|
||||
continue
|
||||
|
||||
newest_epc, older_epcs, full_sap_epc = searcher.retrieve(
|
||||
property_type=property_type_lookup.get(house["Property Type"], None),
|
||||
address=house["Address"],
|
||||
)
|
||||
newest_epc = searcher.newest_epc
|
||||
older_epcs = searcher.older_epcs
|
||||
full_sap_epc = searcher.full_sap_epc
|
||||
|
||||
eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
|
||||
eligibility.check_gbis_warmfront()
|
||||
|
|
@ -273,7 +281,7 @@ def app():
|
|||
)
|
||||
cleaned = msgpack.unpackb(cleaned, raw=False)
|
||||
|
||||
cleaning_data = read_parquet_from_s3(
|
||||
cleaning_data = read_dataframe_from_s3_parquet(
|
||||
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
|
||||
)
|
||||
|
||||
|
|
|
|||
|
|
@ -73,7 +73,9 @@ def app():
|
|||
df["UPRN"] = df["UPRN"].astype("Int64").astype("str")
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
|
||||
uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
|
||||
# uprn_sample = sample(df["UPRN"].unique().tolist(), DIR_SAMPLE_SIZE)
|
||||
# Take a fixed (reproducible) sample: the first DIR_SAMPLE_SIZE UPRNs in sorted order (UPRNs are strings here)
|
||||
uprn_sample = sorted(df["UPRN"].unique().tolist())[:DIR_SAMPLE_SIZE]
|
||||
df_sample = df[df["UPRN"].isin(uprn_sample)]
|
||||
# Take the record with the newest LODGEMENT_DATETIME by uprn
|
||||
df_sample = df_sample.sort_values("LODGEMENT_DATETIME", ascending=False).drop_duplicates("UPRN")
|
||||
|
|
@ -149,6 +151,8 @@ def app():
|
|||
# 0.7859617377809409
|
||||
# 0.5348837209302325
|
||||
|
||||
# Fixed sample, sqrt weights
|
||||
|
||||
# Group by tenure
|
||||
by_tenure = results_df.groupby("tenure").agg(
|
||||
{"numeric_success": "median", "categorical_success": "median", "uprn": "count"}
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue