set up load data function for cancellation app

This commit is contained in:
Khalim Conn-Kowlessar 2024-01-11 11:57:44 +00:00
parent 1bb188a8b8
commit 7969f51733
9 changed files with 234 additions and 91 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -3,7 +3,7 @@
<component name="Black">
<option name="sdkName" value="Python 3.10 (backend)" />
</component>
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -358,9 +358,16 @@ def prepare_model_data_row(
p.get_components(cleaned, photo_supply_lookup=photo_supply_lookup,
floor_area_decile_thresholds=floor_area_decile_thresholds)
# THIS IS TEMP AND SHOULDN'T BE HERE
data_to_clean = p.get_model_data()
if data_to_clean["NUMBER_HEATED_ROOMS"] in ['', None]:
data_to_clean["NUMBER_HEATED_ROOMS"] = data_to_clean["NUMBER_HABITABLE_ROOMS"]
p.data["number-heated-rooms"] = data_to_clean["NUMBER_HABITABLE_ROOMS"]
# This is temp - this should happen after scoring
cleaned_property_data = DataProcessor.apply_averages_cleaning(
data_to_clean=pd.DataFrame([dict(**p.get_model_data(), LOCAL_AUTHORITY=p.data["local-authority"])]),
data_to_clean=pd.DataFrame([dict(**data_to_clean, LOCAL_AUTHORITY=p.data["local-authority"])]),
cleaning_data=cleaning_data,
cols_to_merge_on=['PROPERTY_TYPE', 'BUILT_FORM', 'CONSTRUCTION_AGE_BAND', 'LOCAL_AUTHORITY'],
)

View file

@ -18,6 +18,7 @@ from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
import re
@ -484,9 +485,6 @@ def analyse_results(results_df, data, survey_list):
how="left", on="survey_key"
)
from recommendation_utils import convert_thickness_to_numeric
analysis_data["roof_insulation_thickness"] = analysis_data["roof_insulation_thickness"].fillna(None)
analysis_data["roof_insulation_thickness"] = np.where(
pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"]
)
@ -497,13 +495,12 @@ def analyse_results(results_df, data, survey_list):
warmfront_sold_eco4 = analysis_data[
(analysis_data["warmfront_identified"] == True) & (
analysis_data["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"]))
]
] # 1407
warmfront_sold_gbis = analysis_data[
(analysis_data["warmfront_identified"] == True) & (
analysis_data["funding_scheme"].isin(["ECO4 GBIS (ECO+)"]))
]
# 1407
ideal_eco4_warmfront_not_sold = analysis_data[
(analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & (
@ -519,7 +516,7 @@ def analyse_results(results_df, data, survey_list):
underperforming_cavities = analysis_data[
(analysis_data["eco4_message"] == "Failed due to full cavity - check cavity age") & (
analysis_data["cavity_age"] > 10 * 365
)
) & (analysis_data["roof_insulation_thickness_numeric"] <= 100)
]
identified_gbis_not_sold = analysis_data[
@ -643,7 +640,7 @@ def app():
# Read pickle
# import pickle
# with open("ha16_8_jan_2.pickle", "rb") as f:
# with open("ha16_10_jan.pickle", "rb") as f:
# saved = pickle.load(f)
# scoring_data = saved["scoring_data"]
# results_df = saved["results"]

View file

@ -17,6 +17,7 @@ from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
@ -393,6 +394,42 @@ def analyse_results(results_df, data, survey_list):
how="left", on="survey_key"
)
# NEW
analysis_data["roof_insulation_thickness"] = np.where(
pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"]
)
analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply(
lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True)
)
warmfront_sold_eco4 = analysis_data[
(analysis_data["warmfront_identified"] == True) & (
analysis_data["funding_scheme"].isin(["ECO4 A/W", "AFFORDABLE WARMTH"]))
]
warmfront_sold_gbis = analysis_data[
(analysis_data["warmfront_identified"] == True) & (
analysis_data["funding_scheme"].isin(["ECO4 GBIS (ECO+)"]))
]
# 1407
additional_eco4_warmfront_not_sold = analysis_data[
(analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & (
analysis_data["roof_insulation_thickness_numeric"] <= 100)
]
additional_gbis_warmfront_not_sold = analysis_data[
(analysis_data["gbis_eligible"] == True) & (analysis_data["warmfront_identified"] == False) & (
~analysis_data["row_id"].isin(additional_eco4_warmfront_not_sold["row_id"].values)
)
]
additional_gbis_warmfront_not_sold["walls"].value_counts()
analysis_data["walls"].value_counts()
# END NEW
all_identified_eco = analysis_data[
(analysis_data["warmfront_identified"] & analysis_data["funding_scheme"].isin(
["ECO4 A/W"])) |
@ -480,7 +517,7 @@ def app():
# Read in pickle
# import pickle
# with open("ha24_8_jan.pickle", "rb") as f:
# with open("ha24_10_jan.pickle", "rb") as f:
# saved = pickle.load(f)
# scoring_data = saved["scoring_data"]
# results_df = saved["results"]

View file

@ -17,6 +17,8 @@ from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
import re
@ -341,7 +343,7 @@ def get_epc_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
"Guest Room": {"property-type": None, "built-form": None}
}
for _, property_meta in tqdm(data.iterrows(), total=len(data)):
for _, property_meta in tqdm(data, total=len(data)):
searcher = SearchEpc(
address1=property_meta["HouseNo"],
@ -368,22 +370,35 @@ def get_epc_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
older_epcs = searcher.older_epcs
full_sap_epc = searcher.full_sap_epc
# We also want to get the penultimate epc
penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
if not penultimate_epc:
penultimate_epc = newest_epc
# penultimate_epc, _ = searcher.filter_newest_epc(older_epcs)
# if not penultimate_epc:
# penultimate_epc = newest_epc
eligibility = Eligibility(epc=newest_epc, cleaned=cleaned)
eligibility.check_gbis_warmfront()
eligibility.check_eco4_warmfront()
if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront):
eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
eligibility.check_gbis_warmfront()
eligibility.check_eco4_warmfront()
# If this is the case, we need to update the older epcs
# We don't update just to make data cleaning easier
if penultimate_epc.get("estimated") is None:
older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]
# if (not eligibility.eco4_warmfront["eligible"]) and (not eligibility.gbis_warmfront):
# eligibility = Eligibility(epc=penultimate_epc, cleaned=cleaned)
# eligibility.check_gbis_warmfront()
# eligibility.check_eco4_warmfront()
# # If this is the case, we need to update the older epcs
# # We don't update just to make data cleaning easier
# if penultimate_epc.get("estimated") is None:
# older_epcs = [x for x in searcher.data["rows"] if x["lmk-key"] != penultimate_epc["lmk-key"]]
# If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity
# Loft MUST be suitable
cavity_age = None
if (
eligibility.walls["is_cavity_wall"] and
eligibility.walls["is_filled_cavity"] and
eligibility.loft["suitability"] and
eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age"
):
# We check the age of the cavity and if it's particularly old, we flag it
cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)
# Full checks
eligibility.check_gbis()
@ -396,6 +411,15 @@ def get_epc_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
if eligibility.epc["construction-age-band"] in ["", None]:
eligibility.epc["construction-age-band"] = map_year_to_age_band(property_meta["Build Yr"])
# This is not the right place to do this but this is temp
if eligibility.epc["extension-count"] in ["", None]:
eligibility.epc["extension-count"] = 0
# Not in the right place but temp
if eligibility.epc["built-form"] in ["", None]:
if not older_epcs:
eligibility.epc["built-form"] = "Mid-Terrace"
scoring_dictionary = prepare_model_data_row(
property_id=property_meta["row_id"],
modelling_epc=eligibility.epc,
@ -431,6 +455,9 @@ def get_epc_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
"heating": eligibility.epc["mainheat-description"],
"tenure": eligibility.tenure,
"date_epc": eligibility.epc["lodgement-date"],
"cavity_age": cavity_age,
**eligibility.walls,
**eligibility.roof,
}
)
@ -657,6 +684,8 @@ def get_epc_data_for_lost_surveys(
"heating": eligibility.epc["mainheat-description"],
"tenure": eligibility.tenure,
"date_epc": eligibility.epc["lodgement-date"],
**eligibility.walls,
**eligibility.roof,
}
)
@ -758,58 +787,51 @@ def analyse_results(results_df, data, eco4_prospects_survey_list):
results_df, how="left", on="row_id"
)
warmfront_identified = analysis_data[analysis_data["warmfront_identified"]]
# NEW
analysis_data["roof_insulation_thickness"] = np.where(
pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"]
)
analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply(
lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True)
)
identified_eco = analysis_data[analysis_data["eco4_eligible"] == True]
identified_eco = identified_eco[identified_eco["eco4_message"] == "subject to post retrofit sap"]
warmfront_identified = analysis_data[
(analysis_data["warmfront_identified"] == True)
] # 2204
identified_gbis = analysis_data[
(analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False)
# Because we don't know which property is for which scheme, we'll just look at what we found
ideal_eco4 = analysis_data[
(analysis_data["eco4_eligible"] == True) &
(analysis_data["roof_insulation_thickness_numeric"] <= 100) &
(analysis_data["sap"] <= 54)
] # 335
gbis = analysis_data[
(analysis_data["gbis_eligible"] == True) &
~analysis_data["row_id"].isin(ideal_eco4["row_id"].values)
]
# Take just unfilled cavities and remove filled potentials
identified_gbis["walls"].value_counts()
identified_gbis["walls"].value_counts()
# Of the ECO jobs, what proportion do we get right
success_rate = (warmfront_identified["eco4_eligible"] | warmfront_identified["gbis_eligible"]).sum() / \
warmfront_identified.shape[
0]
# No gbis for this
# gbis_success_rate = warmfront_identified_gbis["gbis_eligible"].sum() / warmfront_identified_gbis.shape[0]
# Additional identified
additional_identified_eco = analysis_data[
(analysis_data["eco4_eligible"] == True) & (analysis_data["warmfront_identified"] == False)
]
additional_identified_eco["eligibility_classification"].value_counts()
additional_identified_gbis = analysis_data[
(analysis_data["gbis_eligible"] == True) & (analysis_data["eco4_eligible"] == False) & (
analysis_data["warmfront_identified"] == False
)
].shape[0]
# Future
additional_identified_eco_future = analysis_data[
(analysis_data["eco4_eligible_future"] == True) & (analysis_data["warmfront_identified"] == False)
].shape[0]
additional_identified_gbis_future = analysis_data[
(analysis_data["gbis_eligible_future"] == True) & (analysis_data["eco4_eligible_future"] == False) & (
analysis_data["warmfront_identified"] == False
)
].shape[0]
ideal_eco4 = ideal_eco4[ideal_eco4["sap"] <= 54]
def analyse_lost_surveys(results_df):
identified_eco = results_df[results_df["eco4_eligible"] == True]
# 59 for lost surveys
identified_gbis = results_df[results_df["gbis_eligible"] == True]
# 107
results_df["roof_insulation_thickness"] = np.where(
pd.isnull(results_df["roof_insulation_thickness"]), None, results_df["roof_insulation_thickness"]
)
results_df["roof_insulation_thickness_numeric"] = results_df["roof_insulation_thickness"].apply(
lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True)
)
ideal_eco4 = results_df[
(results_df["eco4_eligible"] == True) &
(results_df["roof_insulation_thickness_numeric"] <= 100) &
(results_df["sap"] <= 54)
] # 25
gbis = results_df[
(results_df["gbis_eligible"] == True) &
~results_df["row_id"].isin(ideal_eco4["row_id"].values)
] # 82
def app():
@ -837,7 +859,7 @@ def app():
# Pickle the outputs
# Old data was ha25.pickle
# import pickle
# with open("ha25_9_jan.pickle", "wb") as f:
# with open("ha25_10_jan.pickle", "wb") as f:
# pickle.dump(
# {
# "results_df": results_df,
@ -848,9 +870,9 @@ def app():
# )
# Load in pickle
# import pickle
# with open("ha25_9_jan.pickle", "rb") as f:
# saved = pickle.load(f)
# results_df = saved["results_df"]
# scoring_data = saved["scoring_data"]
# nodata = saved["nodata"]
import pickle
with open("ha25_10_jan.pickle", "rb") as f:
saved = pickle.load(f)
results_df = saved["results_df"]
scoring_data = saved["scoring_data"]
nodata = saved["nodata"]

View file

@ -1,3 +1,4 @@
import os
import msgpack
from pathlib import Path
from datetime import datetime
@ -6,7 +7,7 @@ import pandas as pd
from utils.s3 import read_from_s3
from utils.logger import setup_logger
from dotenv import load_dotenv
from backend.app.utils import read_parquet_from_s3
from utils.s3 import read_dataframe_from_s3_parquet
from tqdm import tqdm
from backend.SearchEpc import SearchEpc
from etl.eligibility.Eligibility import Eligibility
@ -14,9 +15,13 @@ from etl.eligibility.ha_15_32.app import prepare_model_data_row
from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
import re
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
logger = setup_logger()
@ -52,7 +57,7 @@ def standardise_ha_4(data):
return data
def get_ha_4_data(data, cleaned, cleaning_data, created_at):
def get_ha_4_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds):
scoring_data = []
results = []
nodata = []
@ -62,19 +67,33 @@ def get_ha_4_data(data, cleaned, cleaning_data, created_at):
searcher = SearchEpc(
address1=property_meta["Address Line 1"],
postcode=property_meta["Post Code"],
size=1000
auth_token=EPC_AUTH_TOKEN,
os_api_key=None,
property_type=property_type_lookup.get(house["Archetype"]),
)
searcher.search()
searcher.find_property(skip_os=True)
if searcher.data is None:
if searcher.newest_epc is None:
searcher = SearchEpc(
address1=property_meta["Location Name"],
postcode=property_meta["Post Code"],
size=1000
auth_token=EPC_AUTH_TOKEN,
os_api_key=None,
property_type=property_type_lookup.get(house["Archetype"]),
)
searcher.search()
if searcher.newest_epc is None:
nodata.append(house["row_id"])
continue
newest_epc = searcher.newest_epc
older_epcs = searcher.older_epcs
full_sap_epc = searcher.full_sap_epc
searcher.search()
if searcher.data is None:
nodata.append(property_meta.to_dict())
continue
@ -273,17 +292,21 @@ def app():
)
cleaned = msgpack.unpackb(cleaned, raw=False)
cleaning_data = read_parquet_from_s3(
cleaning_data = read_dataframe_from_s3_parquet(
bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
)
created_at = datetime.now().isoformat()
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
results_df, scoring_data, nodata = get_ha_4_data(
data=data,
cleaned=cleaned,
cleaning_data=cleaning_data,
created_at=created_at
created_at=created_at,
photo_supply_lookup=photo_supply_lookup,
floor_area_decile_thresholds=floor_area_decile_thresholds
)
# Store the data locally as a pickle

View file

@ -17,6 +17,8 @@ from etl.epc.DataProcessor import DataProcessor
from etl.epc.settings import COLUMNS_TO_MERGE_ON
from backend.ml_models.api import ModelApi
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from recommendations.recommendation_utils import calculate_cavity_age
from recommendation_utils import convert_thickness_to_numeric
ENV_FILE = Path(__file__).parent / "etl" / "eligibility" / "ha_15_32" / ".env"
@ -112,6 +114,19 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
eligibility.check_gbis_warmfront()
eligibility.check_eco4_warmfront()
# If the property is a cavity wall and it's filled, we produce an estimate for the age of the cavity
# Loft MUST be suitable
cavity_age = None
if (
eligibility.walls["is_cavity_wall"] and
eligibility.walls["is_filled_cavity"] and
eligibility.loft["suitability"] and
eligibility.eco4_warmfront["message"] == "Failed due to full cavity - check cavity age"
):
# We check the age of the cavity and if it's particularly old, we flag it
cavity_age = calculate_cavity_age(newest_epc, older_epcs, cleaned)
# If the house is not identified, we do a full gbis and eco4 check
eligibility.check_gbis()
eligibility.check_eco4()
@ -151,6 +166,9 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
"tenure": eligibility.tenure,
"date_epc": eligibility.epc["lodgement-date"],
**newest_epc,
"cavity_age": cavity_age,
**eligibility.walls,
**eligibility.roof,
}
)
@ -250,21 +268,56 @@ def get_ha7_data(data, cleaned, cleaning_data, created_at, photo_supply_lookup,
def analyse_ha_7(results_df, data):
df = results_df.merge(
analysis_data = results_df.merge(
data[["row_id", "row_code", "Property Type", "Construction Year Band"]], how="left", on="row_id"
)
warmfront_identification = df["row_code"].value_counts()
warmfront_identified = df[df["row_code"] == "potential ECO4"]
# NEW
analysis_data["roof_insulation_thickness"] = np.where(
pd.isnull(analysis_data["roof_insulation_thickness"]), None, analysis_data["roof_insulation_thickness"]
)
analysis_data["roof_insulation_thickness_numeric"] = analysis_data["roof_insulation_thickness"].apply(
lambda x: convert_thickness_to_numeric(x, is_flat=False, is_pitched=True)
)
ideal_eco4 = analysis_data[
(analysis_data["eco4_eligible"] == True) & (
analysis_data["roof_insulation_thickness_numeric"] <= 100)
]
secondary_eco4_warmfront_not_sold = analysis_data[
(analysis_data["eco4_eligible"] == True) & (
analysis_data["roof_insulation_thickness_numeric"] > 100)
]
# underperforming cavities
underperforming_cavities = analysis_data[
(analysis_data["eco4_message"] == "Failed due to full cavity - check cavity age") & (
analysis_data["cavity_age"] > 9 * 365
) & (analysis_data["roof_insulation_thickness_numeric"] <= 100)
]
identified_gbis_not_sold = analysis_data[
(analysis_data["gbis_eligible"] == True) & (
analysis_data["eco4_eligible"] == False
)
]
# END NEW
warmfront_identification = analysis_data["row_code"].value_counts()
warmfront_identified = analysis_data[analysis_data["row_code"] == "potential ECO4"]
warmfront_identified["walls"].value_counts(normalize=True)
df["Construction Year Band"].value_counts(normalize=True)
analysis_data["Construction Year Band"].value_counts(normalize=True)
# Number of days from today
days_to_today = (datetime.now() - pd.to_datetime(warmfront_identified["date_epc"])).dt.days
days_to_today.mean()
property_types = df["Property Type"].value_counts()
property_types = analysis_data["Property Type"].value_counts()
n_identified = (results_df["gbis_eligible"] | results_df["eco4_eligible"]).sum()
@ -312,12 +365,12 @@ def app():
# Pickle results
# import pickle
# with open("ha7_results.pkl", "wb") as f:
# with open("ha7_results_jan_10.pkl", "wb") as f:
# pickle.dump({"results_df": results_df, "scoring_data": scoring_data, "nodata": nodata}, f)
# Read in the old data
# import pickle
# with open("ha7_results.pkl", "rb") as f:
# with open("ha7_results_jan_10.pkl", "rb") as f:
# old_data = pickle.load(f)
# results_df = old_data["results_df"]
# scoring_data = old_data["scoring_data"]

View file

@ -176,12 +176,16 @@ class Costs:
"""
material_cost_per_m2 = material["material_cost"]
# We inflate material costs due to recent price increases
material_cost_per_m2 = material_cost_per_m2 * 1.5
base_material_cost = material_cost_per_m2 * floor_area
labour_cost = material["labour_cost"] * floor_area * self.labour_adjustment_factor
subtotal_before_profit = base_material_cost + labour_cost
contingency_cost = subtotal_before_profit * self.CONTINGENCY
# We use the high-risk contingency because of possible access issues and the need to clear existing insulation
contingency_cost = subtotal_before_profit * self.HIGH_RISK_CONTINGENCY
preliminaries_cost = subtotal_before_profit * self.PRELIMINARIES
profit_cost = subtotal_before_profit * self.PROFIT_MARGIN