mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
Stonewater WIP
This commit is contained in:
parent
3977e911ec
commit
603b3e1db2
3 changed files with 200 additions and 1 deletions
|
|
@ -133,7 +133,7 @@ def app():
|
|||
energy_consumption_data = []
|
||||
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
|
||||
# Skip the first 250 directories
|
||||
if i < 245:
|
||||
if i < 250:
|
||||
continue
|
||||
|
||||
data = pd.read_csv(directory / "certificates.csv", low_memory=False)
|
||||
|
|
|
|||
|
|
@ -1995,6 +1995,29 @@ def updated_version():
|
|||
# Pull in the EPC data
|
||||
epc_data = read_epc_data(uprn_lookup_2)
|
||||
|
||||
# Pull in the spatial data to UPRN
|
||||
spatial_data_to_uprn = read_pickle_from_s3(
|
||||
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
|
||||
bucket_name="retrofit-data-dev"
|
||||
)
|
||||
|
||||
# Function to convert specific columns to bool dtype
|
||||
def convert_specific_columns_to_bool(df, columns):
    """Cast the listed columns of ``df`` to ``bool`` dtype.

    Columns named in ``columns`` that are not present in ``df`` are skipped
    silently. The frame is modified in place and the same object is returned
    so the call can be used inside a comprehension.

    Args:
        df: DataFrame whose columns should be converted.
        columns: Iterable of column names to convert when present.

    Returns:
        The same ``df`` object, with the matching columns cast to ``bool``.
    """
    for column in columns:
        if column in df.columns:
            # NOTE(review): astype(bool) uses Python truthiness, so NaN and
            # any non-empty string (including "False") become True - confirm
            # the source columns are already boolean/0-1 encoded.
            df[column] = df[column].astype(bool)
    return df
|
||||
|
||||
spatial_data_to_uprn = [convert_specific_columns_to_bool(
|
||||
df, ['conservation_status', 'is_listed_building', 'is_heritage_building']
|
||||
) for df in spatial_data_to_uprn]
|
||||
|
||||
spatial_data_to_uprn = pd.concat(spatial_data_to_uprn)
|
||||
spatial_data_to_uprn = spatial_data_to_uprn.drop(
|
||||
columns=["partition", "filename"]
|
||||
).rename(columns={"UPRN": "uprn"})
|
||||
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
|
||||
|
||||
########################################################################
|
||||
# Prepare the data
|
||||
########################################################################
|
||||
|
|
@ -2067,6 +2090,17 @@ def updated_version():
|
|||
clustering_features["parity_modelled_sap"]
|
||||
)
|
||||
|
||||
# We remove the final three characters from the postcode to give us the postal region. Removing two gives us 415 values which
|
||||
# is too many
|
||||
clustering_features["postal_region"] = clustering_features["postcode"].str[:-3]
|
||||
|
||||
# Merge on spatial features
|
||||
clustering_features = clustering_features.merge(
|
||||
spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
|
||||
how="left",
|
||||
on="uprn"
|
||||
)
|
||||
|
||||
# incorect_epcs = clustering_features[
|
||||
# clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
|
||||
# incorect_epcs = incorect_epcs[
|
||||
|
|
@ -2076,6 +2110,168 @@ def updated_version():
|
|||
# # Store data
|
||||
# incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
|
||||
|
||||
# We add in the key features, which are used for clustering
|
||||
master_sheet_clustering_features = master_sheet[
|
||||
["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
|
||||
].copy()
|
||||
|
||||
# Step 1: Remap walls - we end up with 11 types
|
||||
master_sheet_clustering_features["walls_reduced"] = master_sheet_clustering_features["Walls"].replace(
|
||||
{
|
||||
"TimberFrame: AsBuilt": "Other wall type, as built",
|
||||
"SystemBuilt: AsBuilt": "Other wall type, as built",
|
||||
"Sandstone: AsBuilt": "Other wall type, as built",
|
||||
"Sandstone: Internal": "Other wall type, internal or external",
|
||||
"SystemBuilt: External": "Other wall type, internal or external",
|
||||
"GraniteOrWhinstone: AsBuilt": "Other wall type, as built",
|
||||
"TimberFrame: Internal": "Other wall type, internal or external",
|
||||
"Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
|
||||
"SystemBuilt: Internal": "Other wall type, internal or external",
|
||||
"Cavity: Internal": "Other wall type, internal or external",
|
||||
}
|
||||
)
|
||||
|
||||
# Step 2: Remap roofs - we split on the : where the first part of the string gives us the roof type, the second
|
||||
# gives us the insulation thickness
|
||||
|
||||
# Clean incorrect values
|
||||
master_sheet_clustering_features["Roofs"] = master_sheet_clustering_features["Roofs"].replace(
|
||||
{
|
||||
"PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
|
||||
"PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
|
||||
'PitchedWithSlopingCeiling: mm25': "PitchedWithSlopingCeiling: 25mm",
|
||||
'PitchedWithSlopingCeiling: mm200': "PitchedWithSlopingCeiling: 200mm",
|
||||
'AnotherDwellingAbove: 50mm': 'PitchedNormalLoftAccess: 50mm',
|
||||
}
|
||||
)
|
||||
|
||||
master_sheet_clustering_features[['roof_type', 'roof_insulation_thickness']] = (
|
||||
master_sheet_clustering_features['Roofs'].apply(
|
||||
lambda x: pd.Series(x.split(':', 1) if ':' in x else [x, ''])
|
||||
)
|
||||
)
|
||||
|
||||
# Strip any extra whitespace
|
||||
master_sheet_clustering_features['roof_type'] = master_sheet_clustering_features['roof_type'].str.strip()
|
||||
master_sheet_clustering_features['roof_insulation_thickness'] = (
|
||||
master_sheet_clustering_features['roof_insulation_thickness'].str.strip()
|
||||
)
|
||||
|
||||
def map_thickness(thickness):
    """Bucket a roof insulation thickness string into a coarse category.

    The input is stripped of ``"mm"``, ``"+"`` and spaces and parsed as a
    number. Values strictly greater than 250 map to ``"Above 250mm"``; all
    other numeric values (including exactly 250) map to ``"Below 250mm"``.

    Args:
        thickness: Thickness text such as ``"150mm"`` or ``"150mm+"``.

    Returns:
        ``"Above 250mm"`` or ``"Below 250mm"`` for numeric input; otherwise
        the original value unchanged (e.g. an empty string, a descriptive
        label, or a non-string such as ``None``/NaN).
    """
    try:
        value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
    except (ValueError, AttributeError):
        # ValueError: text that is not a number once the units are removed.
        # AttributeError: non-string input (None/NaN) has no ``.replace``.
        return thickness
    return "Above 250mm" if value > 250 else "Below 250mm"
|
||||
|
||||
master_sheet_clustering_features['roof_insulation_category'] = (
|
||||
master_sheet_clustering_features['roof_insulation_thickness'].apply(map_thickness)
|
||||
)
|
||||
|
||||
# Ideas
|
||||
# 1) We might need to remap the roof type to pitched, flat or another dwelling above and then have the access
|
||||
# as a secondary category
|
||||
# 2) Split out the (community) tag in the fuel as a secondary feature, which isn't strictly split
|
||||
# (could split on :, take first part)
|
||||
|
||||
clustering_features = clustering_features.merge(
|
||||
master_sheet_clustering_features,
|
||||
how="left",
|
||||
on="Address ID"
|
||||
)
|
||||
|
||||
# Reduce down to the final set of features we need
|
||||
# Keep only the identifier plus the columns used as clustering inputs.
clustering_features = clustering_features[
    [
        "internal_id",
        "Property Type",
        # Location
        "postal_region",
        'conservation_status',
        'is_listed_building',
        'is_heritage_building',
        # Walls
        "walls_reduced",
        # Roof
        "roof_type",
        "roof_insulation_category",
        # Heating
        "Heating",
        # Fuel
        "Main Fuel",
        "Age",
        # The comma after "Total Floor Area" is required: without it Python
        # concatenates the two adjacent literals into one (non-existent) key.
        "Total Floor Area",
        "representative_sap",
    ]
]
|
||||
|
||||
z = master_sheet_clustering_features[
|
||||
["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
|
||||
].drop_duplicates()
|
||||
|
||||
# TODO: heating - remap
|
||||
# Boiler: A rated Regular Boiler
|
||||
# 1944
|
||||
# Boiler: A rated Combi
|
||||
# 1335
|
||||
# Electric Storage Systems: High heat retention storage heaters
|
||||
# 543
|
||||
# Electric Storage Systems: Fan storage heaters
|
||||
# 284
|
||||
# Electric (direct acting) room heaters: Panel, convector or radiant heaters
|
||||
# 253
|
||||
# Boiler: C rated Regular Boiler
|
||||
# 142
|
||||
# Community Heating Systems: Community boilers only (RdSAP)
|
||||
# 127
|
||||
# Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
|
||||
# 126
|
||||
# Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
|
||||
# 70
|
||||
# Boiler: E rated Regular Boiler
|
||||
# 62
|
||||
# Boiler: E rated Combi
|
||||
# 59
|
||||
# Electric Storage Systems: Old (large volume) storage heaters
|
||||
# 55
|
||||
# Electric Storage Systems: Modern (slimline) storage heaters
|
||||
# 49
|
||||
# Boiler: B rated Regular Boiler
|
||||
# 46
|
||||
# Boiler: C rated Combi
|
||||
# 44
|
||||
# Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
|
||||
# 39
|
||||
# Boiler: D rated Regular Boiler
|
||||
# 16
|
||||
# Community Heating Systems: Community CHP and boilers (RdSAP)
|
||||
# 14
|
||||
# Heat Pump: Electric Heat pumps: Air source heat pump in other cases
|
||||
# 13
|
||||
# Boiler: F rated Combi
|
||||
# 12
|
||||
# Boiler: G rated Regular Boiler
|
||||
# 10
|
||||
# Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
|
||||
# 8
|
||||
# Electric (direct acting) room heaters: Water- or oil-filled radiators
|
||||
# 4
|
||||
# Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
|
||||
# 3
|
||||
# Boiler: D rated Combi
|
||||
# 3
|
||||
# Boiler: A rated CPSU
|
||||
# 2
|
||||
# Heat Pump: (from database)
|
||||
# 1
|
||||
# System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
|
||||
# 1
|
||||
# No Heating
|
||||
# 1
|
||||
# Solid fuel room heaters: Open fire in grate
|
||||
# 1
|
||||
# Boiler: F rated Regular Boiler
|
||||
|
||||
|
||||
def read_asset_list():
|
||||
asset_list = pd.read_excel(
|
||||
|
|
|
|||
|
|
@ -75,3 +75,6 @@ def app():
|
|||
agg.insert(0, "Address", address1)
|
||||
|
||||
collected_data.append(agg)
|
||||
|
||||
collected_df = pd.concat(collected_data)
|
||||
collected_df.to_csv("EPC Averages SFR.csv", index=False)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue