mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added booleans to clean missings
This commit is contained in:
parent
edb541f3dc
commit
ef27d6b164
6 changed files with 288 additions and 12 deletions
|
|
@ -45,7 +45,9 @@ class Definitions:
|
|||
# contain a ‘null’ value. A resolution to correct these anomalies will be considered for future data releases.
|
||||
"NULL",
|
||||
# We sometimes see fields populated with just an empty string.
|
||||
""
|
||||
"",
|
||||
# An older value which rarely shows up but has been seen in the data.
|
||||
"UNKNOWN",
|
||||
}
|
||||
|
||||
DATA_ANOMALY_SUBSTRINGS = {
|
||||
|
|
|
|||
|
|
@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
|
|||
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
|
||||
from utils.logger import setup_logger
|
||||
from utils.s3 import read_dataframe_from_s3_parquet
|
||||
from BaseUtility import Definitions
|
||||
from etl.epc.settings import DATA_ANOMALY_MATCHES
|
||||
from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP
|
||||
from recommendations.recommendation_utils import (
|
||||
estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows
|
||||
|
|
@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT =
|
|||
logger = setup_logger()
|
||||
|
||||
|
||||
class Property(Definitions):
|
||||
class Property:
|
||||
ATTRIBUTE_MAP = {
|
||||
"floor-description": "floor",
|
||||
"hotwater-description": "hotwater",
|
||||
|
|
@ -51,6 +51,8 @@ class Property(Definitions):
|
|||
spatial = None
|
||||
base_difference_record = None
|
||||
|
||||
DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
|
||||
|
||||
def __init__(self, id, postcode, address, epc_record):
|
||||
|
||||
self.epc_record = epc_record
|
||||
|
|
@ -302,6 +304,7 @@ class Property(Definitions):
|
|||
self.set_basic_property_dimensions()
|
||||
|
||||
for description, attribute in cleaned.items():
|
||||
|
||||
if self.data[description] in self.DATA_ANOMALY_MATCHES:
|
||||
template = cleaned[description][0]
|
||||
fill_dict = dict(zip(template.keys(), [None] * len(template)))
|
||||
|
|
@ -319,7 +322,7 @@ class Property(Definitions):
|
|||
attributes = [
|
||||
x for x in cleaned[description] if x["original_description"] == self.data[description]
|
||||
]
|
||||
|
||||
|
||||
if len(attributes) > 1:
|
||||
raise ValueError("Either No attributes or multiple found for %s" % description)
|
||||
|
||||
|
|
|
|||
|
|
@ -233,6 +233,13 @@ class Eligibility:
|
|||
def room_roof_insulation(self):
|
||||
is_room_roof = self.roof["is_roof_room"]
|
||||
|
||||
if not is_room_roof:
|
||||
self.room_roof = {
|
||||
"suitability": False,
|
||||
"thickness": None
|
||||
}
|
||||
return
|
||||
|
||||
insulation_thickness = convert_thickness_to_numeric(
|
||||
self.roof["insulation_thickness"],
|
||||
self.roof["is_pitched"],
|
||||
|
|
@ -246,6 +253,14 @@ class Eligibility:
|
|||
|
||||
def flat_roof_insulation(self):
|
||||
is_flat = self.roof["is_flat"]
|
||||
|
||||
if not is_flat:
|
||||
self.flat_roof = {
|
||||
"suitability": False,
|
||||
"thickness": None
|
||||
}
|
||||
return
|
||||
|
||||
insulation_thickness = convert_thickness_to_numeric(
|
||||
self.roof["insulation_thickness"],
|
||||
self.roof["is_pitched"],
|
||||
|
|
|
|||
|
|
@ -154,6 +154,10 @@ class DataLoader:
|
|||
|
||||
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
|
||||
|
||||
# Finally, we process property_type or built form, where needed
|
||||
if ha_name == "ha_6":
|
||||
asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6)
|
||||
|
||||
return asset_list
|
||||
|
||||
def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
|
||||
|
|
@ -412,6 +416,34 @@ class DataLoader:
|
|||
|
||||
return matching_lookup
|
||||
|
||||
@staticmethod
|
||||
def identify_built_form_ha6(property_string):
    """
    Identify the built form of a property from a free-text description.

    Matching is case-insensitive and treats hyphens and repeated whitespace
    as single spaces, so "Semi-Detached", "semi  detached" and
    "SEMI DETACHED" are all recognised as the same built form.

    :param property_string: The string describing the property. Non-string
        values (e.g. None or NaN from an empty spreadsheet cell) are treated
        as unidentifiable.
    :return: The identified built form, or None if it cannot be identified
    """
    # Missing cells arrive as None/NaN rather than str; they carry no
    # built-form information, so report "not identified" instead of raising.
    if not isinstance(property_string, str):
        return None

    # Keywords for each built form. Order matters: 'Semi-Detached' must be
    # checked before 'Detached' because "detached" is a substring of
    # "semi detached".
    built_forms = {
        'Semi-Detached': ['semi detached'],
        'Detached': ['detached'],
        'Mid-Terrace': ['mid terrace', 'mid town house'],
        'End-Terrace': ['end terrace', 'end town house']
    }

    # Normalise case, hyphens and whitespace so that hyphenated spellings
    # ("semi-detached") hit the same keyword as the spaced spelling.
    property_string_normalized = " ".join(
        property_string.lower().replace("-", " ").split()
    )

    # Return the first built form with any keyword present in the input
    for built_form, keywords in built_forms.items():
        if any(keyword in property_string_normalized for keyword in keywords):
            return built_form

    # No built form could be identified
    return None
|
||||
|
||||
def load(self):
|
||||
|
||||
if self.use_cache:
|
||||
|
|
@ -461,7 +493,7 @@ class DataLoader:
|
|||
|
||||
|
||||
def get_epc_data(
|
||||
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
|
||||
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
|
||||
):
|
||||
if not loader.data:
|
||||
raise ValueError("Data not found - please run loader.load() first")
|
||||
|
|
@ -476,10 +508,39 @@ def get_epc_data(
|
|||
'Enclosed Mid': 'Mid-Terrace',
|
||||
'Detached Local Connect': 'Detached',
|
||||
}
|
||||
},
|
||||
"ha_6": {
|
||||
"property_type": {
|
||||
'HOUSE': "House",
|
||||
'GROUND FLOOR FLAT': "Flat",
|
||||
'UPPER FLOOR FLAT': "Flat",
|
||||
'MAISONETTE': "Maisonette",
|
||||
'BUNGALOW': "Bungalow",
|
||||
'WARDEN BUNGALOW': "Bungalow",
|
||||
'WARDEN FLAT': "Flat",
|
||||
'EXTRACARE SCHEME': "Flat",
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
outputs = {}
|
||||
for ha_name, data_assets in loader.data.items():
|
||||
|
||||
if not pull_data:
|
||||
# Then we retrieve the data from S3
|
||||
processed_ha_results = read_pickle_from_s3(
|
||||
bucket_name="retrofit-datalake-dev",
|
||||
s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
|
||||
)
|
||||
|
||||
outputs[ha_name] = {
|
||||
"results_df": processed_ha_results["results_df"],
|
||||
"scoring_data": processed_ha_results["scoring_df"],
|
||||
"nodata": processed_ha_results["nodata"]
|
||||
}
|
||||
continue
|
||||
|
||||
# For each HA, we pull in the data required, and store it in S3
|
||||
asset_list = data_assets["asset_list"].copy()
|
||||
|
||||
|
|
@ -490,8 +551,12 @@ def get_epc_data(
|
|||
# We iterate through the asset list and pull what we need
|
||||
results = []
|
||||
scoring_data = []
|
||||
nodata = []
|
||||
for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
|
||||
|
||||
if property_meta["matching_postcode"] is None:
|
||||
continue
|
||||
|
||||
if ha_name == "ha_1":
|
||||
property_type = property_meta["Asset Type"]
|
||||
# We correct a small error
|
||||
|
|
@ -503,6 +568,9 @@ def get_epc_data(
|
|||
property_type = "Flat"
|
||||
|
||||
built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None)
|
||||
elif ha_name == "ha_6":
|
||||
property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]]
|
||||
built_form = property_meta["built_form"]
|
||||
else:
|
||||
raise NotImplementedError("Implement me")
|
||||
|
||||
|
|
@ -517,6 +585,10 @@ def get_epc_data(
|
|||
searcher.ordnance_survey_client.built_form = built_form
|
||||
searcher.find_property(skip_os=True)
|
||||
|
||||
if searcher.newest_epc is None:
|
||||
nodata.append(property_meta)
|
||||
continue
|
||||
|
||||
if searcher.newest_epc.get("estimated"):
|
||||
# We insert the row ID as our proxy for UPRN
|
||||
searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
|
||||
|
|
@ -606,6 +678,7 @@ def get_epc_data(
|
|||
"cavity_age": cavity_age,
|
||||
**eligibility.walls,
|
||||
**eligibility.roof,
|
||||
"is_estimated": searcher.newest_epc.get("estimated") is not None
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -619,6 +692,10 @@ def get_epc_data(
|
|||
|
||||
model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
|
||||
|
||||
# scoring_df["is_community"].value_counts()
|
||||
# scoring_df[scoring_df["is_community"] == "Unknown"]
|
||||
# property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze()
|
||||
|
||||
all_predictions = model_api.predict_all(
|
||||
df=scoring_df,
|
||||
bucket="retrofit-data-dev",
|
||||
|
|
@ -678,8 +755,33 @@ def get_epc_data(
|
|||
}
|
||||
)
|
||||
|
||||
eligibility_assessment = pd.DataFrame(eligibility_assessment)
|
||||
|
||||
def analyse_ha_data():
|
||||
results_df = results_df.merge(
|
||||
eligibility_assessment, how="left", on="row_id"
|
||||
)
|
||||
|
||||
# We store the results in S3 as a pickle
|
||||
save_pickle_to_s3(
|
||||
data={
|
||||
"results_df": results_df,
|
||||
"scoring_data": scoring_df,
|
||||
"nodata": nodata
|
||||
},
|
||||
bucket_name="retrofit-datalake-dev",
|
||||
s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
|
||||
)
|
||||
|
||||
outputs[ha_name] = {
|
||||
"results_df": results_df,
|
||||
"scoring_data": scoring_df,
|
||||
"nodata": nodata
|
||||
}
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
def analyse_ha_data(outputs, loader):
|
||||
"""
|
||||
The approach we take within this function is the following:
|
||||
For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
|
||||
|
|
@ -697,6 +799,127 @@ def analyse_ha_data():
|
|||
|
||||
:return:
|
||||
"""
|
||||
|
||||
for ha_name, datasets in outputs.items():
|
||||
|
||||
# TODO: This is placeholder because we don't have the schemes that the properties have been qualified for
|
||||
# yet
|
||||
#
|
||||
import random
|
||||
randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0])
|
||||
inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes
|
||||
inputs["asset_list"]["funding_scheme"] = None
|
||||
inputs["asset_list"]["funding_scheme"] = np.where(
|
||||
inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)",
|
||||
inputs["asset_list"]["randomly_allocated_schemes"],
|
||||
inputs["asset_list"]["funding_scheme"]
|
||||
)
|
||||
|
||||
# End placeholder
|
||||
|
||||
results_df = datasets["results_df"].copy()
|
||||
|
||||
inputs = [x for k, x in loader.data.items() if k == ha_name][0]
|
||||
|
||||
analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
|
||||
columns={"row_meaning": "asset_identification_status"}
|
||||
).merge(
|
||||
results_df,
|
||||
how="left",
|
||||
right_on="row_id",
|
||||
left_on="asset_list_row_id"
|
||||
)
|
||||
|
||||
# If we have a survey list, we merge this onto the results
|
||||
|
||||
n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
|
||||
|
||||
properties_sold = (
|
||||
inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
|
||||
inputs["survey_list"] is not None else 0
|
||||
)
|
||||
properties_sold_eco4 = (
|
||||
properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
|
||||
properties_sold != 0 else 0
|
||||
)
|
||||
properties_sold_gbis = (
|
||||
properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
|
||||
properties_sold != 0 else 0
|
||||
)
|
||||
|
||||
# We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
|
||||
# remaining
|
||||
|
||||
if inputs["matched_lookup"] is not None:
|
||||
analysis_data = analysis_data.merge(
|
||||
inputs["matched_lookup"], how="left", on="asset_list_row_id"
|
||||
)
|
||||
# Drop any rows that have a survey_list_row_id
|
||||
analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
|
||||
|
||||
# We now calculate the number of remaining properties, by scheme
|
||||
# TODO: We might need to tweak a bit of the knowledge
|
||||
remaining_properties = analysis_data[
|
||||
analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
|
||||
]
|
||||
|
||||
remaining_properties_by_scheme = (
|
||||
remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
|
||||
)
|
||||
remaining_properties_eco4 = remaining_properties_by_scheme[
|
||||
remaining_properties_by_scheme["funding_scheme"] == "ECO4"
|
||||
]["asset_list_row_id"].values[0]
|
||||
|
||||
remaining_properties_gbis = remaining_properties_by_scheme[
|
||||
remaining_properties_by_scheme["funding_scheme"] == "GBIS"
|
||||
]["asset_list_row_id"].values[0]
|
||||
|
||||
# For the remaining properties, we use the results of the eligibility process to classify the property into
|
||||
# one of multiple categories
|
||||
#
|
||||
# For properties that have been identified as ECO4
|
||||
# 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
|
||||
# - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
|
||||
# here and re-surveying is a common practice by Warmfront. Additionally, many of the social homes have
|
||||
# very old EPCs which may score lower when re-done
|
||||
# 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
|
||||
# - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
|
||||
# actually look like after retrofit and so the EPC currently being a C or above means little, because
|
||||
# the updated EPC, showing an empty cavity, could bring the property within
|
||||
# 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
|
||||
# - No SAP constraint, for the same reason as in category 2)
|
||||
# 4) Does not look like ECO4 candidate
|
||||
#
|
||||
# For properties that have been identified as GBIS
|
||||
# 1) Strict GBIS candidates
|
||||
# 2) Properties that actually look like strict GBIS candidates
|
||||
# 3) Subject to CIGA check - Filled cavity
|
||||
# 4) Does not look like a GBIS candidate
|
||||
|
||||
# ECO4
|
||||
# 1) We identify this if:
|
||||
# - remaining_properties["eco4_eligible"] == True
|
||||
# - remaining_properties[""]
|
||||
remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
|
||||
remaining_properties["eco4_message"].value_counts()
|
||||
z = remaining_properties[
|
||||
(remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
|
||||
(remaining_properties["eco4_eligible"] == True)
|
||||
]
|
||||
|
||||
k = z[z["property_type"] == "Flat"]
|
||||
k["uprn"]
|
||||
|
||||
ha_analysis_results = {
|
||||
"n_properties_in_asset_list": n_properties_in_asset_list,
|
||||
# ECO4
|
||||
"properties_sold_eco4": properties_sold_eco4,
|
||||
"remaining_properties_eco4": remaining_properties_eco4,
|
||||
# GBIS
|
||||
"properties_sold_gbis": properties_sold_gbis,
|
||||
"remaining_properties_gbis": remaining_properties_gbis
|
||||
}
|
||||
|
||||
pass
|
||||
|
||||
|
||||
|
|
@ -789,10 +1012,10 @@ def app():
|
|||
# Patch mainheatcont-description
|
||||
cleaned["mainheatcont-description"].extend(
|
||||
[
|
||||
{'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False,
|
||||
'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False,
|
||||
'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False,
|
||||
'rate_control': False}
|
||||
{'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
|
||||
'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
|
||||
'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
|
||||
'rate_control': None}
|
||||
]
|
||||
)
|
||||
|
||||
|
|
@ -810,4 +1033,4 @@ def app():
|
|||
|
||||
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
|
||||
|
||||
get_epc_data(loader)
|
||||
outputs = get_epc_data(loader)
|
||||
|
|
|
|||
|
|
@ -11,6 +11,37 @@ from recommendations.recommendation_utils import (
|
|||
get_wall_type
|
||||
)
|
||||
|
||||
# TODO: Can probably produce this in the property change app and store in S3
|
||||
BOOLEAN_VARIABLES = [
|
||||
'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
|
||||
'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
|
||||
'external_insulation', 'internal_insulation', 'is_park_home_ending', 'external_insulation_ending',
|
||||
'internal_insulation_ending', 'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid',
|
||||
'another_property_below', 'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters',
|
||||
'has_dwelling_above', 'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
|
||||
'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab', 'has_boiler', 'has_air_source_heat_pump',
|
||||
'has_room_heaters', 'has_electric_storage_heaters', 'has_warm_air', 'has_electric_underfloor_heating',
|
||||
'has_electric_ceiling_heating', 'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
|
||||
'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump', 'has_micro-cogeneration',
|
||||
'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump', 'has_community_heat_pump', 'has_electric',
|
||||
'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets', 'has_anthracite',
|
||||
'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k', 'has_electricaire',
|
||||
'has_assumed_for_most_rooms', 'has_underfloor_heating', 'has_radiators_ending', 'has_fan_coil_units_ending',
|
||||
'has_pipes_in_screed_above_insulation_ending', 'has_pipes_in_insulated_timber_floor_ending',
|
||||
'has_pipes_in_concrete_slab_ending', 'has_boiler_ending', 'has_air_source_heat_pump_ending',
|
||||
'has_room_heaters_ending', 'has_electric_storage_heaters_ending', 'has_warm_air_ending',
|
||||
'has_electric_underfloor_heating_ending', 'has_electric_ceiling_heating_ending', 'has_community_scheme_ending',
|
||||
'has_ground_source_heat_pump_ending', 'has_no_system_present_ending', 'has_portable_electric_heaters_ending',
|
||||
'has_water_source_heat_pump_ending', 'has_electric_heat_pump_ending', 'has_micro-cogeneration_ending',
|
||||
'has_solar_assisted_heat_pump_ending', 'has_exhaust_source_heat_pump_ending', 'has_community_heat_pump_ending',
|
||||
'has_electric_ending', 'has_mains_gas_ending', 'has_wood_logs_ending', 'has_coal_ending', 'has_oil_ending',
|
||||
'has_wood_pellets_ending', 'has_anthracite_ending', 'has_dual_fuel_mineral_and_wood_ending',
|
||||
'has_smokeless_fuel_ending', 'has_lpg_ending', 'has_b30k_ending', 'has_electricaire_ending',
|
||||
'has_assumed_for_most_rooms_ending', 'has_underfloor_heating_ending', 'multiple_room_thermostats',
|
||||
'multiple_room_thermostats_ending', 'is_community', 'no_individual_heating_or_community_network',
|
||||
'is_community_ending', 'no_individual_heating_or_community_network_ending'
|
||||
]
|
||||
|
||||
|
||||
class BaseDataset:
|
||||
"""
|
||||
|
|
@ -439,7 +470,7 @@ class TrainingDataset(BaseDataset):
|
|||
|
||||
for col in missings.index:
|
||||
unique_values = self.df[col].unique()
|
||||
if True in unique_values or False in unique_values:
|
||||
if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES):
|
||||
self.df[col] = self.df[col].fillna(False)
|
||||
if "none" in unique_values:
|
||||
self.df[col] = self.df[col].fillna("none")
|
||||
|
|
|
|||
|
|
@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = {
|
|||
"",
|
||||
# We sometimes find None values - particularly when we produce an estimated EPC
|
||||
None,
|
||||
# An older value which rarely shows up but has been seen in the data.
|
||||
"UNKNOWN",
|
||||
}
|
||||
|
||||
DATA_ANOMALY_SUBSTRINGS = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue