Added booleans to clean missings

This commit is contained in:
Khalim Conn-Kowlessar 2024-01-24 21:21:01 +00:00
parent edb541f3dc
commit ef27d6b164
6 changed files with 288 additions and 12 deletions

View file

@ -45,7 +45,9 @@ class Definitions:
# contain a null value. A resolution to correct these anomalies will be considered for future data releases.
"NULL",
# We sometimes see fields populated with just an empty string.
""
"",
# An older value which rarely shows up but has been seen in the data.
"UNKNOWN",
}
DATA_ANOMALY_SUBSTRINGS = {

View file

@ -13,7 +13,7 @@ from etl.epc_clean.epc_attributes.all_cleaners import all_cleaner_map
from etl.solar.SolarPhotoSupply import SolarPhotoSupply
from utils.logger import setup_logger
from utils.s3 import read_dataframe_from_s3_parquet
from BaseUtility import Definitions
from etl.epc.settings import DATA_ANOMALY_MATCHES
from recommendations.rdsap_tables import england_wales_age_band_lookup, FLOOR_LEVEL_MAP
from recommendations.recommendation_utils import (
estimate_perimeter, get_wall_type, estimate_external_wall_area, esimtate_pitched_roof_area, estimate_windows
@ -25,7 +25,7 @@ DATA_BUCKET = os.environ.get('DATA_BUCKET', 'retrofit-data-dev' if ENVIRONMENT =
logger = setup_logger()
class Property(Definitions):
class Property:
ATTRIBUTE_MAP = {
"floor-description": "floor",
"hotwater-description": "hotwater",
@ -51,6 +51,8 @@ class Property(Definitions):
spatial = None
base_difference_record = None
DATA_ANOMALY_MATCHES = DATA_ANOMALY_MATCHES
def __init__(self, id, postcode, address, epc_record):
self.epc_record = epc_record
@ -302,6 +304,7 @@ class Property(Definitions):
self.set_basic_property_dimensions()
for description, attribute in cleaned.items():
if self.data[description] in self.DATA_ANOMALY_MATCHES:
template = cleaned[description][0]
fill_dict = dict(zip(template.keys(), [None] * len(template)))
@ -319,7 +322,7 @@ class Property(Definitions):
attributes = [
x for x in cleaned[description] if x["original_description"] == self.data[description]
]
if len(attributes) > 1:
raise ValueError("Either No attributes or multiple found for %s" % description)

View file

@ -233,6 +233,13 @@ class Eligibility:
def room_roof_insulation(self):
is_room_roof = self.roof["is_roof_room"]
if not is_room_roof:
self.room_roof = {
"suitability": False,
"thickness": None
}
return
insulation_thickness = convert_thickness_to_numeric(
self.roof["insulation_thickness"],
self.roof["is_pitched"],
@ -246,6 +253,14 @@ class Eligibility:
def flat_roof_insulation(self):
is_flat = self.roof["is_flat"]
if not is_flat:
self.flat_roof = {
"suitability": False,
"thickness": None
}
return
insulation_thickness = convert_thickness_to_numeric(
self.roof["insulation_thickness"],
self.roof["is_pitched"],

View file

@ -154,6 +154,10 @@ class DataLoader:
asset_list = pd.concat([asset_list, house_numbers[["HouseNo"]]], axis=1)
# Finally, we process property_type or built form, where needed
if ha_name == "ha_6":
asset_list["built_form"] = asset_list["Property Type"].apply(self.identify_built_form_ha6)
return asset_list
def load_survey_list(self, file_path, ha_name, asset_list, sheet_name=None):
@ -412,6 +416,34 @@ class DataLoader:
return matching_lookup
@staticmethod
def identify_built_form_ha6(property_string):
    """
    Identify the built form of a property from the given string.

    Matching is case-insensitive, and hyphens and runs of whitespace are treated as a single space, so
    "Semi-Detached" and "semi  detached" both resolve to 'Semi-Detached' instead of falling through to the
    'detached' keyword.

    :param property_string: The string describing the property
    :return: The identified built form, or None if it cannot be identified. Non-string input (e.g. the None/NaN
        passed for an empty cell when this is applied over a DataFrame column) also yields None.
    """
    # Empty cells are not strings and have no .lower(); there is nothing to identify in them
    if not isinstance(property_string, str):
        return None

    # Define keywords for each built form. Order matters: 'semi detached' must be tested before 'detached',
    # because every semi-detached description also contains the word 'detached'.
    built_forms = (
        ('Semi-Detached', ('semi detached',)),
        ('Detached', ('detached',)),
        ('Mid-Terrace', ('mid terrace', 'mid town house')),
        ('End-Terrace', ('end terrace', 'end town house')),
    )

    # Normalize to lower case, with hyphens and repeated whitespace collapsed to single spaces
    property_string_normalized = " ".join(property_string.lower().replace("-", " ").split())

    # Return the first built form that has any keyword present in the input string
    for built_form, keywords in built_forms:
        if any(keyword in property_string_normalized for keyword in keywords):
            return built_form

    # Return None if no built form is identified
    return None
def load(self):
if self.use_cache:
@ -461,7 +493,7 @@ class DataLoader:
def get_epc_data(
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds
loader, cleaned, cleaning_data, created_at, photo_supply_lookup, floor_area_decile_thresholds, pull_data=True
):
if not loader.data:
raise ValueError("Data not found - please run loader.load() first")
@ -476,10 +508,39 @@ def get_epc_data(
'Enclosed Mid': 'Mid-Terrace',
'Detached Local Connect': 'Detached',
}
},
"ha_6": {
"property_type": {
'HOUSE': "House",
'GROUND FLOOR FLAT': "Flat",
'UPPER FLOOR FLAT': "Flat",
'MAISONETTE': "Maisonette",
'BUNGALOW': "Bungalow",
'WARDEN BUNGALOW': "Bungalow",
'WARDEN FLAT': "Flat",
'EXTRACARE SCHEME': "Flat",
}
}
}
outputs = {}
for ha_name, data_assets in loader.data.items():
if not pull_data:
# Then we retrieve the data from S3
processed_ha_results = read_pickle_from_s3(
bucket_name="retrofit-datalake-dev",
s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
)
outputs[ha_name] = {
"results_df": processed_ha_results["results_df"],
"scoring_data": processed_ha_results["scoring_df"],
"nodata": processed_ha_results["nodata"]
}
continue
# For each HA, we pull in the data required, and store it in S3
asset_list = data_assets["asset_list"].copy()
@ -490,8 +551,12 @@ def get_epc_data(
# We iterate through the asset list and pull what we need
results = []
scoring_data = []
nodata = []
for index, property_meta in tqdm(asset_list.iterrows(), total=len(asset_list)):
if property_meta["matching_postcode"] is None:
continue
if ha_name == "ha_1":
property_type = property_meta["Asset Type"]
# We correct a small error
@ -503,6 +568,9 @@ def get_epc_data(
property_type = "Flat"
built_form = property_type_lookup[ha_name]["built_form"].get(property_meta["Property Type"], None)
elif ha_name == "ha_6":
property_type = property_type_lookup[ha_name]["property_type"][property_meta["Dwelling type"]]
built_form = property_meta["built_form"]
else:
raise NotImplementedError("Implement me")
@ -517,6 +585,10 @@ def get_epc_data(
searcher.ordnance_survey_client.built_form = built_form
searcher.find_property(skip_os=True)
if searcher.newest_epc is None:
nodata.append(property_meta)
continue
if searcher.newest_epc.get("estimated"):
# We insert the row ID as our proxy for UPRN
searcher.newest_epc["uprn"] = int(property_meta["asset_list_row_id"].split(ha_name)[1])
@ -606,6 +678,7 @@ def get_epc_data(
"cavity_age": cavity_age,
**eligibility.walls,
**eligibility.roof,
"is_estimated": searcher.newest_epc.get("estimated") is not None
}
)
@ -619,6 +692,10 @@ def get_epc_data(
model_api = ModelApi(portfolio_id="-".join([ha_name, "eligibility"]), timestamp=created_at)
# scoring_df["is_community"].value_counts()
# scoring_df[scoring_df["is_community"] == "Unknown"]
# property_meta = asset_list[asset_list["asset_list_row_id"] == "ha_67238"].squeeze()
all_predictions = model_api.predict_all(
df=scoring_df,
bucket="retrofit-data-dev",
@ -678,8 +755,33 @@ def get_epc_data(
}
)
eligibility_assessment = pd.DataFrame(eligibility_assessment)
def analyse_ha_data():
results_df = results_df.merge(
eligibility_assessment, how="left", on="row_id"
)
# We store the results in S3 as a pickle
save_pickle_to_s3(
data={
"results_df": results_df,
"scoring_data": scoring_df,
"nodata": nodata
},
bucket_name="retrofit-datalake-dev",
s3_file_name=f"ha-analysis/{ha_name}/processed_results.pickle"
)
outputs[ha_name] = {
"results_df": results_df,
"scoring_data": scoring_df,
"nodata": nodata
}
return outputs
def analyse_ha_data(outputs, loader):
"""
The approach we take within this function is the following:
For properties that have been identified by warmfront as eligible properties, characterise them by scheme. The
@ -697,6 +799,127 @@ def analyse_ha_data():
:return:
"""
for ha_name, datasets in outputs.items():
# TODO: This is placeholder because we don't have the schemes that the properties have been qualified for
# yet
#
import random
randomly_allocated_schemes = random.choices(["ECO4", "GBIS"], k=inputs["asset_list"].shape[0])
inputs["asset_list"]["randomly_allocated_schemes"] = randomly_allocated_schemes
inputs["asset_list"]["funding_scheme"] = None
inputs["asset_list"]["funding_scheme"] = np.where(
inputs["asset_list"]["row_meaning"] == "identified potential eco works (CWI)",
inputs["asset_list"]["randomly_allocated_schemes"],
inputs["asset_list"]["funding_scheme"]
)
# End placeholder
results_df = datasets["results_df"].copy()
inputs = [x for k, x in loader.data.items() if k == ha_name][0]
analysis_data = inputs["asset_list"][['asset_list_row_id', "row_meaning", "funding_scheme"]].rename(
columns={"row_meaning": "asset_identification_status"}
).merge(
results_df,
how="left",
right_on="row_id",
left_on="asset_list_row_id"
)
# If we have a survey list, we merge this onto the results
n_properties_in_asset_list = analysis_data["asset_list_row_id"].nunique()
properties_sold = (
inputs["survey_list"].groupby("funding_scheme")["survey_list_row_id"].nunique().reset_index() if
inputs["survey_list"] is not None else 0
)
properties_sold_eco4 = (
properties_sold[properties_sold["funding_scheme"] == "ECO4"]["survey_list_row_id"].values[0] if
properties_sold != 0 else 0
)
properties_sold_gbis = (
properties_sold[properties_sold["funding_scheme"] == "GBIS"]["survey_list_row_id"].values[0] if
properties_sold != 0 else 0
)
# We now merge the survey list onto the analysis data and remove anything that is sold, to give us just what is
# remaining
if inputs["matched_lookup"] is not None:
analysis_data = analysis_data.merge(
inputs["matched_lookup"], how="left", on="asset_list_row_id"
)
# Drop any rows that have a survey_list_row_id
analysis_data = analysis_data[pd.isnull(analysis_data["survey_list_row_id"])]
# We now calculate the number of remaining properties, by scheme
# TODO: We might need to tweak a bit of the knowledge
remaining_properties = analysis_data[
analysis_data["asset_identification_status"] == "identified potential eco works (CWI)"
]
remaining_properties_by_scheme = (
remaining_properties.groupby("funding_scheme")["asset_list_row_id"].nunique().reset_index()
)
remaining_properties_eco4 = remaining_properties_by_scheme[
remaining_properties_by_scheme["funding_scheme"] == "ECO4"
]["asset_list_row_id"].values[0]
remaining_properties_gbis = remaining_properties_by_scheme[
remaining_properties_by_scheme["funding_scheme"] == "GBIS"
]["asset_list_row_id"].values[0]
# For the remaining properties, we use the results of the eligibility process to classify the property into
# one of multiple categories
#
# For properties that have been identified as ECO4
# 1) Strict ECO4 candidate - Has required fabric and EPC is below a D
# - This is not the very strictest definition of ECO4 eligible, but we aim to characterise the properties
# here and re-surveying is a common practice by Warmfront. Additionally, many of the social homes have
# very old EPCs which may score lower when re-done
# 2) Subject to CIGA check - Meets loft conditions but shows a filled cavity.
# - we don't have a SAP constraint here because the EPC is (currently) showing what the property might
# actually look like after retrofit and so the EPC currently being a C or above means little, because
# the updated EPC, showing an empty cavity, could bring the property within
# 3) Loft insulation too thick - Meets empty cavity but shows a loft with between 101 and 270mm insulation.
# - No SAP constraint, for the same reason as in category 2)
# 4) Does not look like ECO4 candidate
#
# For properties that have been identified as GBIS
# 1) Strict GBIS candidates
# 2) Properties that actually look like strict GBIS candidates
# 3) Subject to CIGA check - Filled cavity
# 4) Does not look like a GBIS candidate
# ECO4
# 1) We identify this if:
# - remaining_properties["eco4_eligible"] == True
# - remaining_properties[""]
remaining_properties[remaining_properties["eco4_eligible"] == True]["eco4_message"].value_counts()
remaining_properties["eco4_message"].value_counts()
z = remaining_properties[
(remaining_properties["eco4_message"] == "Possibly eligible but property currently EPC D") &
(remaining_properties["eco4_eligible"] == True)
]
k = z[z["property_type"] == "Flat"]
k["uprn"]
ha_analysis_results = {
"n_properties_in_asset_list": n_properties_in_asset_list,
# ECO4
"properties_sold_eco4": properties_sold_eco4,
"remaining_properties_eco4": remaining_properties_eco4,
# GBIS
"properties_sold_gbis": properties_sold_gbis,
"remaining_properties_gbis": remaining_properties_gbis
}
pass
@ -789,10 +1012,10 @@ def app():
# Patch mainheatcont-description
cleaned["mainheatcont-description"].extend(
[
{'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': False,
'charging_system': False, 'switch_system': False, 'no_control': False, 'dhw_control': False,
'community_heating': False, 'multiple_room_thermostats': False, 'auxiliary_systems': False, 'trvs': False,
'rate_control': False}
{'original_description': 'None', 'clean_description': 'None', 'thermostatic_control': None,
'charging_system': None, 'switch_system': None, 'no_control': None, 'dhw_control': None,
'community_heating': None, 'multiple_room_thermostats': False, 'auxiliary_systems': None, 'trvs': None,
'rate_control': None}
]
)
@ -810,4 +1033,4 @@ def app():
photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
get_epc_data(loader)
outputs = get_epc_data(loader)

View file

@ -11,6 +11,37 @@ from recommendations.recommendation_utils import (
get_wall_type
)
# TODO: Can probably produce this in the property change app and store in S3

# Flags that are listed individually; their "_ending" counterparts, where present, are interleaved by hand.
_LEADING_FLAGS = [
    'is_cavity_wall', 'is_filled_cavity', 'is_solid_brick', 'is_system_built', 'is_timber_frame',
    'is_granite_or_whinstone', 'is_as_built', 'is_cob', 'is_sandstone_or_limestone', 'is_park_home',
    'external_insulation', 'internal_insulation',
    'is_park_home_ending', 'external_insulation_ending', 'internal_insulation_ending',
    'is_to_unheated_space', 'is_to_external_air', 'is_suspended', 'is_solid', 'another_property_below',
    'is_pitched', 'is_roof_room', 'is_loft', 'is_flat', 'is_thatched', 'is_at_rafters', 'has_dwelling_above',
]

# "has_*" flags that each appear twice in the final list: once as written here and once with "_ending" appended.
_PAIRED_HAS_FLAGS = [
    'has_radiators', 'has_fan_coil_units', 'has_pipes_in_screed_above_insulation',
    'has_pipes_in_insulated_timber_floor', 'has_pipes_in_concrete_slab',
    'has_boiler', 'has_air_source_heat_pump', 'has_room_heaters', 'has_electric_storage_heaters',
    'has_warm_air', 'has_electric_underfloor_heating', 'has_electric_ceiling_heating',
    'has_community_scheme', 'has_ground_source_heat_pump', 'has_no_system_present',
    'has_portable_electric_heaters', 'has_water_source_heat_pump', 'has_electric_heat_pump',
    'has_micro-cogeneration', 'has_solar_assisted_heat_pump', 'has_exhaust_source_heat_pump',
    'has_community_heat_pump',
    'has_electric', 'has_mains_gas', 'has_wood_logs', 'has_coal', 'has_oil', 'has_wood_pellets',
    'has_anthracite', 'has_dual_fuel_mineral_and_wood', 'has_smokeless_fuel', 'has_lpg', 'has_b30k',
    'has_electricaire', 'has_assumed_for_most_rooms', 'has_underfloor_heating',
]

# Remaining flags, again written out in full.
_TRAILING_FLAGS = [
    'multiple_room_thermostats', 'multiple_room_thermostats_ending',
    'is_community', 'no_individual_heating_or_community_network',
    'is_community_ending', 'no_individual_heating_or_community_network_ending',
]

# Names of columns to be treated as boolean. Kept as a list, in the original order: leading flags, the paired
# flags, the "_ending" twin of every paired flag (same order), then the trailing flags.
BOOLEAN_VARIABLES = (
    _LEADING_FLAGS
    + _PAIRED_HAS_FLAGS
    + [f"{flag}_ending" for flag in _PAIRED_HAS_FLAGS]
    + _TRAILING_FLAGS
)
class BaseDataset:
"""
@ -439,7 +470,7 @@ class TrainingDataset(BaseDataset):
for col in missings.index:
unique_values = self.df[col].unique()
if True in unique_values or False in unique_values:
if (True in unique_values) or (False in unique_values) or (col in BOOLEAN_VARIABLES):
self.df[col] = self.df[col].fillna(False)
if "none" in unique_values:
self.df[col] = self.df[col].fillna("none")

View file

@ -46,6 +46,8 @@ DATA_ANOMALY_MATCHES = {
"",
# We sometimes find None values - particularly when we produce an estimated EPC
None,
# An older value which rarely shows up but has been seen in the data.
"UNKNOWN",
}
DATA_ANOMALY_SUBSTRINGS = {