Added potential variables to dataset

This commit is contained in:
Khalim Conn-Kowlessar 2023-10-17 15:38:17 +11:00
parent 2ead4906be
commit 40a6d2041e
5 changed files with 108 additions and 14 deletions

2
.idea/Model.iml generated
View file

@ -7,7 +7,7 @@
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
<component name="PyNamespacePackagesService">

2
.idea/misc.xml generated
View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
<component name="PythonCompatibilityInspectionAdvertiser">
<option name="version" value="3" />
</component>

View file

@ -569,3 +569,20 @@ class DataProcessor:
df[col] = df[col].fillna("Unknown")
return df
@staticmethod
def clean_efficiency_variables(df):
missings = pd.isnull(df).sum()
missings = missings[missings >= 1]
if len(missings) == 0:
return df
# Make sure they are all efficiency columns
if any(~missings.index.str.contains("ENERGY_EFF")):
raise ValueError("Non efficiency columns are missing")
for m in missings.index:
df[m] = df[m].fillna("NO_RATING")
return df

View file

@ -12,6 +12,10 @@ from etl.epc.settings import (
HEAT_DEMAND_RESPONSE,
COLUMNS_TO_MERGE_ON,
CARBON_RESPONSE,
CORE_COMPONENT_FEATURES,
EFFICIENCY_FEATURES,
POTENTIAL_COLUMNS,
MINIMUM_FLOOR_HEIGHT
)
from etl.epc.DataProcessor import DataProcessor
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
@ -363,6 +367,25 @@ def make_uvalues(df):
return df
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
"""
For a list of columns, check if the earliest and latest record are the same
If they are the same, we indicate this, because we have example of SAP scores changing
without any feature changes
:param earliest_record: pd.Series
:param latest_record: pd.Series
:param columns: list of columns to compare
:return: boolean indicating whether or not all features are the same
"""
all_equal = True
for col in columns:
if earliest_record[col] != latest_record[col]:
return False
if all_equal:
return True
def app():
# Get all the files in the directory
@ -376,6 +399,8 @@ def app():
dataset = []
cleaning_dataset = []
# Keep track of the number of all equals
all_equal_count = 0
for directory in tqdm(directories):
@ -422,7 +447,9 @@ def app():
# We include the lodgement date here as we probably need to factor time into the
# model, since EPC standards and rigour have changed over time
variable_data = property_data[
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
COMPONENT_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + [
"LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE
]
]
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
@ -439,6 +466,8 @@ def app():
# Check if the sap gets better or worse
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
if gets_better:
starting_sap = earliest_record[RDSAP_RESPONSE]
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
@ -452,8 +481,8 @@ def app():
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
else:
starting_sap = latest_record[RDSAP_RESPONSE]
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
@ -467,12 +496,23 @@ def app():
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
if rdsap_change == 0:
continue
all_equal = compare_records(
earliest_record=earliest_record,
latest_record=latest_record,
columns=CORE_COMPONENT_FEATURES
)
if all_equal:
# Keep track of this for the moment
all_equal_count += 1
continue
features = pd.concat([starting_record, ending_record])
property_model_data.append(
@ -487,6 +527,10 @@ def app():
"HEAT_DEMAND_ENDING": ending_heat_demand,
"CARBON_STARTING": starting_carbon,
"CARBON_ENDING": ending_carbon,
"POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
"ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
"ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
"CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
**fixed_data,
**features.to_dict(),
}
@ -496,8 +540,6 @@ def app():
data_by_urpn_df = pd.DataFrame(data_by_urpn)
# Add some temporal features - we look at the days from the standard starting point in time
# for the starting and ending date so all records are from a fixed point
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
data_by_urpn_df["LODGEMENT_DATE_STARTING"]
)
@ -508,6 +550,8 @@ def app():
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
# We look for key building fabric features that have changed from one EPC to the next.
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
@ -541,6 +585,8 @@ def app():
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
cleaning_dataset.append(cleaning_averages)
print("Final all equal count: %s" % str(all_equal_count))
# Store cleaning dataset in s3 as a parquet file
cleaning_dataset = pd.concat(cleaning_dataset)
save_dataframe_to_s3_parquet(

View file

@ -85,8 +85,7 @@ FIXED_FEATURES = [
"FIXED_LIGHTING_OUTLETS_COUNT",
]
COMPONENT_FEATURES = [
"TRANSACTION_TYPE",
CORE_COMPONENT_FEATURES = [
"WALLS_DESCRIPTION",
"FLOOR_DESCRIPTION",
"LIGHTING_DESCRIPTION",
@ -96,21 +95,49 @@ COMPONENT_FEATURES = [
"MAIN_FUEL",
"MECHANICAL_VENTILATION",
"SECONDHEAT_DESCRIPTION",
"ENERGY_TARIFF", # Not sure if this is relevant
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
"WINDOWS_DESCRIPTION",
"GLAZED_TYPE",
"MULTI_GLAZE_PROPORTION",
"LOW_ENERGY_LIGHTING",
"NUMBER_OPEN_FIREPLACES",
"MAINHEATCONT_DESCRIPTION",
"SOLAR_WATER_HEATING_FLAG",
"PHOTO_SUPPLY",
]
EFFICIENCY_FEATURES = [
'HOT_WATER_ENERGY_EFF',
'FLOOR_ENERGY_EFF',
'WINDOWS_ENERGY_EFF',
'WALLS_ENERGY_EFF',
'SHEATING_ENERGY_EFF',
'ROOF_ENERGY_EFF',
'MAINHEAT_ENERGY_EFF',
'MAINHEATC_ENERGY_EFF',
'LIGHTING_ENERGY_EFF'
]
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
"TRANSACTION_TYPE",
"ENERGY_TARIFF", # Not sure if this is relevant
"EXTENSION_COUNT",
"TOTAL_FLOOR_AREA",
"FLOOR_HEIGHT",
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
]
POTENTIAL_COLUMNS = [
'POTENTIAL_ENERGY_RATING',
'POTENTIAL_ENERGY_EFFICIENCY',
'ENVIRONMENT_IMPACT_POTENTIAL',
'ENERGY_CONSUMPTION_POTENTIAL',
'CO2_EMISSIONS_POTENTIAL',
# We don't include cost features for the moment
# 'LIGHTING_COST_POTENTIAL',
# 'HEATING_COST_POTENTIAL',
# 'HOT_WATER_COST_POTENTIAL'
]
# For these fields, we take the latest value if we have multiple values
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
# the most accurate
@ -253,3 +280,7 @@ ENDING_SUFFIX_COMPONENT_COLS = [
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
]
# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore
# filter out any homes with a floor height below this
MINIMUM_FLOOR_HEIGHT = 1.65