mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Working on simulation system data
This commit is contained in:
parent
b3614a13d9
commit
44099f8d83
3 changed files with 130 additions and 11 deletions
2
.idea/Model.iml
generated
2
.idea/Model.iml
generated
|
|
@ -7,7 +7,7 @@
|
|||
<sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
|
||||
<sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (backend)" jdkType="Python SDK" />
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (model_data)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
||||
2
.idea/misc.xml
generated
2
.idea/misc.xml
generated
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (backend)" project-jdk-type="Python SDK" />
|
||||
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (model_data)" project-jdk-type="Python SDK" />
|
||||
<component name="PythonCompatibilityInspectionAdvertiser">
|
||||
<option name="version" value="3" />
|
||||
</component>
|
||||
|
|
|
|||
|
|
@ -49,10 +49,56 @@ COMPONENT_FEATURES = [
|
|||
'EXTENSION_COUNT'
|
||||
]
|
||||
|
||||
# For these fields, we take an average if we have multiple values
|
||||
AVERAGE_FIXED_FEATURES = [
|
||||
"TOTAL_FLOOR_AREA"
|
||||
"TOTAL_FLOOR_AREA",
|
||||
"FLOOR_HEIGHT"
|
||||
]
|
||||
|
||||
# For these fields, we take the latest value if we have multiple values
|
||||
# Since more recent EPCs have been conducted with more rigour, we assume that the latest value is
|
||||
# the most accurate
|
||||
LATEST_FIELD = [
|
||||
"NUMBER_HABITABLE_ROOMS",
|
||||
"NUMBER_HEATED_ROOMS",
|
||||
"FIXED_LIGHTING_OUTLETS_COUNT",
|
||||
"CONSTRUCTION_AGE_BAND"
|
||||
]
|
||||
|
||||
# If we see these features changing, we don't use the EPC, since we deem it not to be reliable
|
||||
MANDATORY_FIXED_FEATURES = [
|
||||
"PROPERTY_TYPE",
|
||||
]
|
||||
|
||||
# For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were
|
||||
# conducted after SAP12 took effect, since SAP09 was introduced in 2009 and later SAP12 was introduced in England
|
||||
# and Wales from 31 July 2014
|
||||
EARLIEST_EPC_DATE = "2014-08-01"
|
||||
|
||||
RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
|
||||
HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
|
||||
|
||||
|
||||
def make_cleaning_averages(df):
|
||||
cleaning_averages = df.groupby(
|
||||
["PROPERTY_TYPE", "BUILT_FORM", "CONSTRUCTION_AGE_BAND", "NUMBER_HABITABLE_ROOMS", "NUMBER_HEATED_ROOMS"]
|
||||
)[AVERAGE_FIXED_FEATURES].mean().reset_index()
|
||||
|
||||
return cleaning_averages
|
||||
|
||||
|
||||
FLOOR_LEVEL_MAP = {
|
||||
"00": 0,
|
||||
"01": 1,
|
||||
"02": 2,
|
||||
"03": 3,
|
||||
"Basement": -1,
|
||||
"Ground": 0,
|
||||
"1st": 1,
|
||||
"2nd": 2,
|
||||
"3rd": 3,
|
||||
}
|
||||
|
||||
|
||||
def app():
|
||||
# Get all the files in the directory
|
||||
|
|
@ -63,6 +109,15 @@ def app():
|
|||
filepath = os.path.join(DATA_DIRECTORY, directory, "certificates.csv")
|
||||
df = pd.read_csv(filepath, low_memory=False)
|
||||
df = df[~pd.isnull(df["UPRN"])]
|
||||
|
||||
df = df[df["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
|
||||
|
||||
cleaning_averages = make_cleaning_averages(df)
|
||||
|
||||
# We remove EPCS that were conducted for a new build, since these are performed with
|
||||
# full SAP, which produces different results to the RdSAP methodology
|
||||
df = df[df["TRANSACTION_TYPE"] != "new dwelling"]
|
||||
|
||||
df["UPRN"] = df["UPRN"].astype(int).astype(str)
|
||||
counts = df.groupby("UPRN").size().reset_index()
|
||||
counts.columns = ["UPRN", "count"]
|
||||
|
|
@ -73,32 +128,70 @@ def app():
|
|||
df = df[df["UPRN"].isin(counts["UPRN"])]
|
||||
df = df.sort_values(["UPRN", "LODGEMENT_DATE"], ascending=True)
|
||||
|
||||
for uprn, property_data in df.groupby("UPRN"):
|
||||
results = []
|
||||
for uprn, property_data in df.groupby("UPRN", observed=True):
|
||||
|
||||
# Fixed features - these are property attributes that shouldn't change over time
|
||||
|
||||
ignore_epc = False
|
||||
fixed_data = {}
|
||||
for field in FIXED_FEATURES:
|
||||
vals = property_data[field].dropna().unique()
|
||||
# Remove invalid values
|
||||
vals = [v for v in vals if v not in BaseUtility.DATA_ANOMALY_MATCHES]
|
||||
|
||||
if len(vals) > 1:
|
||||
raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
|
||||
if field == "FLOOR_LEVEL":
|
||||
vals = list({FLOOR_LEVEL_MAP[v] for v in vals})
|
||||
|
||||
if field in AVERAGE_FIXED_FEATURES:
|
||||
# Check whether the values are too far apart
|
||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
raise ValueError("Large deviation in fixed feature {} - fix me".format(field))
|
||||
|
||||
field_value = np.mean(vals)
|
||||
if len(vals) > 1:
|
||||
# Check whether the values are too far apart
|
||||
if abs(vals[0] - vals[1]) / vals[0] > 0.1:
|
||||
# Take the more recent value since it's likely to be more accurate
|
||||
vals = [vals[-1]]
|
||||
|
||||
if vals:
|
||||
field_value = np.mean(vals)
|
||||
else:
|
||||
# Clean using averages
|
||||
avgs = cleaning_averages[
|
||||
(cleaning_averages["PROPERTY_TYPE"] == property_data["PROPERTY_TYPE"].iloc[0]) &
|
||||
(cleaning_averages["BUILT_FORM"] == property_data["BUILT_FORM"].iloc[0]) &
|
||||
(cleaning_averages["CONSTRUCTION_AGE_BAND"] == property_data["CONSTRUCTION_AGE_BAND"].iloc[
|
||||
0]) &
|
||||
(cleaning_averages["NUMBER_HABITABLE_ROOMS"] ==
|
||||
property_data["NUMBER_HABITABLE_ROOMS"].iloc[0]) &
|
||||
(cleaning_averages["NUMBER_HEATED_ROOMS"] == property_data["NUMBER_HEATED_ROOMS"].iloc[0])
|
||||
]
|
||||
|
||||
field_value = avgs[field].iloc[0]
|
||||
|
||||
elif field in LATEST_FIELD:
|
||||
field_value = vals[-1] if vals else None
|
||||
else:
|
||||
if len(vals) > 1:
|
||||
if field in MANDATORY_FIXED_FEATURES:
|
||||
ignore_epc = True
|
||||
else:
|
||||
raise ValueError("Fixed feature {} has more than one value - fix me".format(field))
|
||||
|
||||
field_value = vals[0] if vals else None
|
||||
|
||||
fixed_data[field] = field_value
|
||||
|
||||
variable_data = property_data[COMPONENT_FEATURES]
|
||||
if ignore_epc:
|
||||
continue
|
||||
|
||||
# We include the lodgement date here as we probably need to factor time into the
|
||||
# model, since EPC standards and rigour have changed over time
|
||||
variable_data = property_data[
|
||||
COMPONENT_FEATURES + ["LODGEMENT_DATE", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE]
|
||||
]
|
||||
|
||||
# Note: we look at changes between subsequent EPCS, however we could look at other permutations
|
||||
# e.g. first vs second, second vs third and also first vs third
|
||||
property_model_data = []
|
||||
for idx in range(0, property_data.shape[0] - 1):
|
||||
|
||||
if idx >= property_data.shape[0] - 1:
|
||||
|
|
@ -106,3 +199,29 @@ def app():
|
|||
|
||||
starting_record = variable_data.iloc[idx]
|
||||
ending_record = variable_data.iloc[idx + 1]
|
||||
rdsap_change = ending_record[RDSAP_RESPONSE] - starting_record[RDSAP_RESPONSE]
|
||||
heat_demand_change = ending_record[HEAT_DEMAND_RESPONSE] - starting_record[HEAT_DEMAND_RESPONSE]
|
||||
|
||||
if rdsap_change == 0:
|
||||
# Assumption: We aren't interested in records that exhibit no change
|
||||
continue
|
||||
|
||||
# TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and
|
||||
# floors, we may want to use the U-value. We may also want to handle the (assumed) tags
|
||||
# within descriptions
|
||||
|
||||
starting_record = starting_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_STARTING")
|
||||
ending_record = ending_record[COMPONENT_FEATURES + ["LODGEMENT_DATE"]].add_suffix("_ENDING")
|
||||
|
||||
features = pd.concat([starting_record, ending_record])
|
||||
|
||||
property_model_data.append(
|
||||
{
|
||||
"UPRN": uprn,
|
||||
"RDSAP_CHANGE": rdsap_change,
|
||||
"HEAT_DEMAND_CHANGE": heat_demand_change,
|
||||
**features.to_dict()
|
||||
}
|
||||
)
|
||||
|
||||
results.extend(property_model_data)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue