Stonewater WIP

This commit is contained in:
Khalim Conn-Kowlessar 2024-07-23 17:07:22 +01:00
parent 3977e911ec
commit 603b3e1db2
3 changed files with 200 additions and 1 deletions

View file

@ -133,7 +133,7 @@ def app():
energy_consumption_data = []
for i, directory in tqdm(enumerate(epc_directories), total=len(epc_directories)):
# Skip the leading directories, up to the index threshold checked below (no longer the first 50)
if i < 245:
if i < 250:
continue
data = pd.read_csv(directory / "certificates.csv", low_memory=False)

View file

@ -1995,6 +1995,29 @@ def updated_version():
# Pull in the EPC data
epc_data = read_epc_data(uprn_lookup_2)
# Pull in the spatial data keyed by UPRN
# NOTE(review): the S3 key starts with "scustomers/" - confirm this prefix is intentional and not a typo
# for "customers/"
# NOTE(review): the object is a .pkl file; presumably read_pickle_from_s3 unpickles it, so the bucket
# contents must be trusted - confirm
# The result is used as an iterable of DataFrames (it is iterated and passed to pd.concat afterwards)
spatial_data_to_uprn = read_pickle_from_s3(
s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
bucket_name="retrofit-data-dev"
)
# Function to convert specific columns to bool dtype
def convert_specific_columns_to_bool(df, columns):
    """Cast the named columns of ``df`` to ``bool`` in place and return ``df``.

    Names in ``columns`` that are not columns of ``df`` are ignored.

    NOTE(review): ``astype(bool)`` follows truthiness, so NaN and every
    non-empty string (including "False") become True - confirm that is the
    intended handling for the flag columns passed in.
    """
    present = [name for name in columns if name in df.columns]
    for name in present:
        df[name] = df[name].astype(bool)
    return df
# Cast the flag columns on every frame, stack the frames into one table, drop the "partition" and
# "filename" columns and rename the key column to lower-case "uprn"
spatial_data_to_uprn = (
    pd.concat(
        [
            convert_specific_columns_to_bool(
                frame, ['conservation_status', 'is_listed_building', 'is_heritage_building']
            )
            for frame in spatial_data_to_uprn
        ]
    )
    .drop(columns=["partition", "filename"])
    .rename(columns={"UPRN": "uprn"})
)
# NOTE(review): astype(str) keeps whatever textual form the source dtype has (a float UPRN would become
# e.g. "123.0") - confirm it matches the "uprn" values this table is merged against
spatial_data_to_uprn["uprn"] = spatial_data_to_uprn["uprn"].astype(str)
########################################################################
# Prepare the data
########################################################################
@ -2067,6 +2090,17 @@ def updated_version():
clustering_features["parity_modelled_sap"]
)
# Derive a postal region by dropping the last three characters of the postcode. Dropping only two
# characters gives 415 distinct values, which is too many
clustering_features["postal_region"] = clustering_features["postcode"].str.slice(stop=-3)
# Attach the spatial flags by UPRN, keeping every row of clustering_features (left join)
# NOTE(review): assumes "uprn" is unique in spatial_data_to_uprn; duplicates would repeat rows - confirm
clustering_features = pd.merge(
    clustering_features,
    spatial_data_to_uprn[["uprn", "conservation_status", "is_listed_building", "is_heritage_building"]],
    on="uprn",
    how="left",
)
# incorect_epcs = clustering_features[
# clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]]
# incorect_epcs = incorect_epcs[
@ -2076,6 +2110,168 @@ def updated_version():
# # Store data
# incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False)
# Take the master-sheet columns that feed the clustering. The .copy() gives an independent frame, so the
# derived columns added afterwards are not written to a slice of master_sheet
master_sheet_clustering_features = master_sheet.loc[
    :, ["Address ID", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Total Floor Area"]
].copy()
# Step 1: Remap walls. Rare wall constructions are folded into two catch-all groups and one cavity variant
# is merged into plain filled cavity (the original note records 11 resulting types); any value that is not
# listed here is kept as it is
wall_type_remap = {
    **dict.fromkeys(
        [
            "TimberFrame: AsBuilt",
            "SystemBuilt: AsBuilt",
            "Sandstone: AsBuilt",
            "GraniteOrWhinstone: AsBuilt",
        ],
        "Other wall type, as built",
    ),
    **dict.fromkeys(
        [
            "Sandstone: Internal",
            "SystemBuilt: External",
            "TimberFrame: Internal",
            "SystemBuilt: Internal",
            "Cavity: Internal",
        ],
        "Other wall type, internal or external",
    ),
    "Cavity: FilledCavityPlusInternal": "Cavity: FilledCavity",
}
master_sheet_clustering_features["walls_reduced"] = (
    master_sheet_clustering_features["Walls"].replace(wall_type_remap)
)
# Step 2: Remap roofs. A "Roofs" value reads "<roof type>: <insulation thickness>", so it is later split
# on the ":". Before that, rewrite the malformed values into the regular "<type>: <n>mm" form
# NOTE(review): the last entry changes the roof type itself (AnotherDwellingAbove -> PitchedNormalLoftAccess),
# not just the formatting - confirm this is intended
roof_value_fixes = {
    "PitchedWithSlopingCeiling: mm250": "PitchedWithSlopingCeiling: 250mm",
    "PitchedWithSlopingCeiling: 150mm+": "PitchedWithSlopingCeiling: 150mm",
    "PitchedWithSlopingCeiling: mm25": "PitchedWithSlopingCeiling: 25mm",
    "PitchedWithSlopingCeiling: mm200": "PitchedWithSlopingCeiling: 200mm",
    "AnotherDwellingAbove: 50mm": "PitchedNormalLoftAccess: 50mm",
}
master_sheet_clustering_features["Roofs"] = (
    master_sheet_clustering_features["Roofs"].replace(roof_value_fixes)
)
# Split each "Roofs" value at the first ":" into the roof type (text before it) and the insulation
# thickness (text after it, or "" when there is no ":"). str.partition does this in one vectorised call
# that yields (before, separator, after) columns, and it leaves a missing value as NaN at this step
# rather than raising the way a per-row `':' in x` test does
roof_parts = master_sheet_clustering_features['Roofs'].str.partition(':')
# Strip any extra whitespace around either part
master_sheet_clustering_features['roof_type'] = roof_parts[0].str.strip()
master_sheet_clustering_features['roof_insulation_thickness'] = roof_parts[2].str.strip()
def map_thickness(thickness):
    """Bucket a roof insulation thickness label into a coarse category.

    Labels such as "150mm", "150mm+" or " 300 mm" are read as a number after
    removing "mm", "+" and spaces. A value strictly greater than 250 maps to
    "Above 250mm"; any other value that parses (250 itself included) maps to
    "Below 250mm".

    Anything that cannot be parsed - an empty string, a descriptive label, or
    a non-string such as None/NaN - is returned unchanged.
    """
    try:
        value = float(thickness.replace('mm', '').replace('+', '').replace(' ', ''))
    except (AttributeError, ValueError):
        # ValueError: text that is not a number. AttributeError: not a string at all (e.g. NaN for a
        # missing value), which would otherwise abort the whole column mapping
        return thickness
    return "Above 250mm" if value > 250 else "Below 250mm"
# Collapse the raw thickness labels into the coarse categories returned by map_thickness
master_sheet_clustering_features['roof_insulation_category'] = (
    master_sheet_clustering_features['roof_insulation_thickness'].map(map_thickness)
)
# Ideas
# 1) Roof type may need remapping to pitched, flat or another dwelling above, with the access kept as a
# secondary category
# 2) Split the (community) tag out of the fuel as a secondary feature, since it isn't strictly split
# (could split on ":" and take the first part)
# Bring the master-sheet features onto clustering_features by "Address ID", keeping every existing row
clustering_features = pd.merge(
    clustering_features,
    master_sheet_clustering_features,
    on="Address ID",
    how="left",
)
# Reduce down to the final set of features we need
# Every entry needs its own trailing comma: two adjacent string literals are silently joined into a single
# (non-existent) column name, which makes this selection raise a KeyError
clustering_features = clustering_features[
    [
        "internal_id",
        "Property Type",
        # Location
        "postal_region",
        'conservation_status',
        'is_listed_building',
        'is_heritage_building',
        # Walls
        "walls_reduced",
        # Roof
        "roof_type",
        "roof_insulation_category",
        # Heating
        "Heating",
        # Fuel
        "Main Fuel",
        "Age",
        "Total Floor Area",
        "representative_sap",
    ]
]
# Distinct combinations of the categorical master-sheet features
# NOTE(review): `z` is not used in the visible code - looks like an exploratory/debugging aid; confirm
# whether it is still needed
z = master_sheet_clustering_features.loc[
    :, ["Property Type", "walls_reduced", "roof_type", "roof_insulation_category", "Main Fuel", "Age"]
].drop_duplicates()
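# TODO: remap the "Heating" column into fewer categories. The distinct values are listed below, each
# presumably followed by its row count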
# Boiler: A rated Regular Boiler
# 1944
# Boiler: A rated Combi
# 1335
# Electric Storage Systems: High heat retention storage heaters
# 543
# Electric Storage Systems: Fan storage heaters
# 284
# Electric (direct acting) room heaters: Panel, convector or radiant heaters
# 253
# Boiler: C rated Regular Boiler
# 142
# Community Heating Systems: Community boilers only (RdSAP)
# 127
# Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C
# 126
# Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C
# 70
# Boiler: E rated Regular Boiler
# 62
# Boiler: E rated Combi
# 59
# Electric Storage Systems: Old (large volume) storage heaters
# 55
# Electric Storage Systems: Modern (slimline) storage heaters
# 49
# Boiler: B rated Regular Boiler
# 46
# Boiler: C rated Combi
# 44
# Heat Pump: Electric Heat pumps: Ground source heat pump in other cases
# 39
# Boiler: D rated Regular Boiler
# 16
# Community Heating Systems: Community CHP and boilers (RdSAP)
# 14
# Heat Pump: Electric Heat pumps: Air source heat pump in other cases
# 13
# Boiler: F rated Combi
# 12
# Boiler: G rated Regular Boiler
# 10
# Boiler: A rated Combi, System 2: Electric Storage Systems: High heat retention storage heaters
# 8
# Electric (direct acting) room heaters: Water- or oil-filled radiators
# 4
# Boiler: A rated Combi, System 2: Electric (direct acting) room heaters: Panel, convector or radiant heaters
# 3
# Boiler: D rated Combi
# 3
# Boiler: A rated CPSU
# 2
# Heat Pump: (from database)
# 1
# System 2: Boiler: G rated Regular Boiler, Boiler: A rated Combi
# 1
# No Heating
# 1
# Solid fuel room heaters: Open fire in grate
# 1
# Boiler: F rated Regular Boiler
def read_asset_list():
asset_list = pd.read_excel(

View file

@ -75,3 +75,6 @@ def app():
agg.insert(0, "Address", address1)
collected_data.append(agg)
collected_df = pd.concat(collected_data)
collected_df.to_csv("EPC Averages SFR.csv", index=False)