mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Added potential variables to dataset
This commit is contained in:
parent
2ead4906be
commit
40a6d2041e
5 changed files with 108 additions and 14 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
<component name="PyNamespacePackagesService">
|
||||
|
|
|
|||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -569,3 +569,20 @@ class DataProcessor:
|
|||
df[col] = df[col].fillna("Unknown")
|
||||
|
||||
return df
|
||||
|
||||
@staticmethod
|
||||
def clean_efficiency_variables(df):
|
||||
missings = pd.isnull(df).sum()
|
||||
missings = missings[missings >= 1]
|
||||
|
||||
if len(missings) == 0:
|
||||
return df
|
||||
|
||||
# Make sure they are all efficiency columns
|
||||
if any(~missings.index.str.contains("ENERGY_EFF")):
|
||||
raise ValueError("Non efficiency columns are missing")
|
||||
|
||||
for m in missings.index:
|
||||
df[m] = df[m].fillna("NO_RATING")
|
||||
|
||||
return df
|
||||
|
|
|
|||
|
|
@ -12,6 +12,10 @@ from etl.epc.settings import (
|
|||
HEAT_DEMAND_RESPONSE,
|
||||
COLUMNS_TO_MERGE_ON,
|
||||
CARBON_RESPONSE,
|
||||
CORE_COMPONENT_FEATURES,
|
||||
EFFICIENCY_FEATURES,
|
||||
POTENTIAL_COLUMNS,
|
||||
MINIMUM_FLOOR_HEIGHT
|
||||
)
|
||||
from etl.epc.DataProcessor import DataProcessor
|
||||
from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3
|
||||
|
|
@ -363,6 +367,25 @@ def make_uvalues(df):
|
|||
return df
|
||||
|
||||
|
||||
def compare_records(earliest_record: pd.Series, latest_record: pd.Series, columns: list):
|
||||
"""
|
||||
For a list of columns, check if the earliest and latest record are the same
|
||||
If they are the same, we indicate this, because we have example of SAP scores changing
|
||||
without any feature changes
|
||||
:param earliest_record: pd.Series
|
||||
:param latest_record: pd.Series
|
||||
:param columns: list of columns to compare
|
||||
:return: boolean indicating whether or not all features are the same
|
||||
"""
|
||||
|
||||
all_equal = True
|
||||
for col in columns:
|
||||
if earliest_record[col] != latest_record[col]:
|
||||
return False
|
||||
if all_equal:
|
||||
return True
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
||||
|
|
@ -376,6 +399,8 @@ def app():
|
|||
|
||||
dataset = []
|
||||
cleaning_dataset = []
|
||||
# Keep track of the number of all equals
|
||||
all_equal_count = 0
|
||||
|
||||
for directory in tqdm(directories):
|
||||
|
||||
|
|
@ -422,7 +447,9 @@ def app():
|
|||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = property_data[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE]
|
||||
COMPONENT_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + [
|
||||
"LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE
|
||||
]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||
|
|
@ -439,6 +466,8 @@ def app():
|
|||
# Check if the sap gets better or worse
|
||||
gets_better = earliest_record[RDSAP_RESPONSE] <= latest_record[RDSAP_RESPONSE]
|
||||
|
||||
component_variables = COMPONENT_FEATURES + EFFICIENCY_FEATURES
|
||||
|
||||
if gets_better:
|
||||
starting_sap = earliest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = earliest_record[HEAT_DEMAND_RESPONSE]
|
||||
|
|
@ -452,8 +481,8 @@ def app():
|
|||
heat_demand_change = latest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = latest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
starting_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
else:
|
||||
starting_sap = latest_record[RDSAP_RESPONSE]
|
||||
starting_heat_demand = latest_record[HEAT_DEMAND_RESPONSE]
|
||||
|
|
@ -467,12 +496,23 @@ def app():
|
|||
heat_demand_change = earliest_record[HEAT_DEMAND_RESPONSE] - starting_heat_demand
|
||||
carbon_change = earliest_record[CARBON_RESPONSE] - starting_carbon
|
||||
|
||||
starting_record = latest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
starting_record = latest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = earliest_record[component_variables + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
if rdsap_change == 0:
|
||||
continue
|
||||
|
||||
all_equal = compare_records(
|
||||
earliest_record=earliest_record,
|
||||
latest_record=latest_record,
|
||||
columns=CORE_COMPONENT_FEATURES
|
||||
)
|
||||
|
||||
if all_equal:
|
||||
# Keep track of this for the moment
|
||||
all_equal_count += 1
|
||||
continue
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
property_model_data.append(
|
||||
|
|
@ -487,6 +527,10 @@ def app():
|
|||
"HEAT_DEMAND_ENDING": ending_heat_demand,
|
||||
"CARBON_STARTING": starting_carbon,
|
||||
"CARBON_ENDING": ending_carbon,
|
||||
"POTENTIAL_ENERGY_EFFICIENCY": earliest_record["POTENTIAL_ENERGY_EFFICIENCY"],
|
||||
"ENVIRONMENT_IMPACT_POTENTIAL": earliest_record["ENVIRONMENT_IMPACT_POTENTIAL"],
|
||||
"ENERGY_CONSUMPTION_POTENTIAL": earliest_record["ENERGY_CONSUMPTION_POTENTIAL"],
|
||||
"CO2_EMISSIONS_POTENTIAL": earliest_record["CO2_EMISSIONS_POTENTIAL"],
|
||||
**fixed_data,
|
||||
**features.to_dict(),
|
||||
}
|
||||
|
|
@ -496,8 +540,6 @@ def app():
|
|||
|
||||
data_by_urpn_df = pd.DataFrame(data_by_urpn)
|
||||
|
||||
# Add some temporal features - we look at the days from the standard starting point in time
|
||||
# for the starting and ending date so all records are from a fixed point
|
||||
data_by_urpn_df["DAYS_TO_STARTING"] = DataProcessor.calculate_days_to(
|
||||
data_by_urpn_df["LODGEMENT_DATE_STARTING"]
|
||||
)
|
||||
|
|
@ -508,6 +550,8 @@ def app():
|
|||
|
||||
data_by_urpn_df = data_by_urpn_df.drop(columns=["LODGEMENT_DATE_STARTING", "LODGEMENT_DATE_ENDING"])
|
||||
|
||||
data_by_urpn_df = DataProcessor.clean_efficiency_variables(data_by_urpn_df)
|
||||
|
||||
# We look for key building fabric features that have changed from one EPC to the next.
|
||||
# if, for example, we see that a home has gone from being a cavity wall to a solid wall, we
|
||||
# remove this record, as it indicates that the quality of the EPC conducted in the first instance
|
||||
|
|
@ -541,6 +585,8 @@ def app():
|
|||
cleaning_averages["LOCAL_AUTHORITY"] = df["LOCAL_AUTHORITY"].values[0]
|
||||
cleaning_dataset.append(cleaning_averages)
|
||||
|
||||
print("Final all equal count: %s" % str(all_equal_count))
|
||||
|
||||
# Store cleaning dataset in s3 as a parquet file
|
||||
cleaning_dataset = pd.concat(cleaning_dataset)
|
||||
save_dataframe_to_s3_parquet(
|
||||
|
|
|
|||
|
|
@ -85,8 +85,7 @@ FIXED_FEATURES = [
|
|||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = [
|
||||
"TRANSACTION_TYPE",
|
||||
CORE_COMPONENT_FEATURES = [
|
||||
"WALLS_DESCRIPTION",
|
||||
"FLOOR_DESCRIPTION",
|
||||
"LIGHTING_DESCRIPTION",
|
||||
|
|
@ -96,21 +95,49 @@ COMPONENT_FEATURES = [
|
|||
"MAIN_FUEL",
|
||||
"MECHANICAL_VENTILATION",
|
||||
"SECONDHEAT_DESCRIPTION",
|
||||
"ENERGY_TARIFF", # Not sure if this is relevant
|
||||
"SOLAR_WATER_HEATING_FLAG",
|
||||
"PHOTO_SUPPLY",
|
||||
"WINDOWS_DESCRIPTION",
|
||||
"GLAZED_TYPE",
|
||||
"MULTI_GLAZE_PROPORTION",
|
||||
"LOW_ENERGY_LIGHTING",
|
||||
"NUMBER_OPEN_FIREPLACES",
|
||||
"MAINHEATCONT_DESCRIPTION",
|
||||
"SOLAR_WATER_HEATING_FLAG",
|
||||
"PHOTO_SUPPLY",
|
||||
]
|
||||
|
||||
EFFICIENCY_FEATURES = [
|
||||
'HOT_WATER_ENERGY_EFF',
|
||||
'FLOOR_ENERGY_EFF',
|
||||
'WINDOWS_ENERGY_EFF',
|
||||
'WALLS_ENERGY_EFF',
|
||||
'SHEATING_ENERGY_EFF',
|
||||
'ROOF_ENERGY_EFF',
|
||||
'MAINHEAT_ENERGY_EFF',
|
||||
'MAINHEATC_ENERGY_EFF',
|
||||
'LIGHTING_ENERGY_EFF'
|
||||
]
|
||||
|
||||
COMPONENT_FEATURES = CORE_COMPONENT_FEATURES + [
|
||||
"TRANSACTION_TYPE",
|
||||
"ENERGY_TARIFF", # Not sure if this is relevant
|
||||
"EXTENSION_COUNT",
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT",
|
||||
# 'GLAZED_AREA', # May not need this since we have MULTI_GLAZE_PROPORTION
|
||||
]
|
||||
|
||||
POTENTIAL_COLUMNS = [
|
||||
'POTENTIAL_ENERGY_RATING',
|
||||
'POTENTIAL_ENERGY_EFFICIENCY',
|
||||
'ENVIRONMENT_IMPACT_POTENTIAL',
|
||||
'ENERGY_CONSUMPTION_POTENTIAL',
|
||||
'CO2_EMISSIONS_POTENTIAL',
|
||||
# We don't include cost features for the moment
|
||||
# 'LIGHTING_COST_POTENTIAL',
|
||||
# 'HEATING_COST_POTENTIAL',
|
||||
# 'HOT_WATER_COST_POTENTIAL'
|
||||
]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
# the most accurate
|
||||
|
|
@ -253,3 +280,7 @@ ENDING_SUFFIX_COMPONENT_COLS = [
|
|||
'rate_control', 'glazing_type', 'fuel_type', 'main-fuel_tariff_type', 'is_community',
|
||||
'no_individual_heating_or_community_network', 'complex_fuel_type', 'estimated_perimeter'
|
||||
]
|
||||
|
||||
# We found that without performing any filtering, the bottom 0.5% of homes had a floor height of 1.65m. We'll therefore
|
||||
# filter out any homes with a floor height below this
|
||||
MINIMUM_FLOOR_HEIGHT = 1.65
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue