refactoring creation of epc dataset

This commit is contained in:
Khalim Conn-Kowlessar 2025-02-20 08:39:29 +00:00
parent 4a6802a5a2
commit 37cc43adb1
2 changed files with 59 additions and 60 deletions

View file

@ -202,6 +202,33 @@ class AssetList:
This class is used to standardise asset lists so that we can process the core information in a consistent manner.
"""
# Maps source EPC field names (keys) to the "epc_"-prefixed column names used
# once the data is merged into the asset list (values). The keys double as the
# list of columns to keep when the EPC dataframe is trimmed down, and the whole
# dict is passed to DataFrame.rename(columns=...).
# NOTE(review): keys are presumably the field names returned by the EPC API - confirm.
EPC_API_DATA_NAMES = {
"uprn": "epc_os_uprn",
"address1": "epc_address1",
"address": "epc_address",
"postcode": "epc_postcode",
"inspection-date": "epc_inspection_date",
"current-energy-efficiency": "epc_sap_score_on_register",
"current-energy-rating": "epc_rating_on_register",
"property-type": "epc_property_type",
"built-form": "epc_archetype",
"total-floor-area": "epc_total_floor_area",
"construction-age-band": "epc_age_band",
"floor-height": "epc_floor_height",
"number-habitable-rooms": "epc_number_habitable_rooms",
"walls-description": "epc_wall_construction",
"roof-description": "epc_roof_construction",
"floor-description": "epc_floor_construction",
"mainheat-description": "epc_heating_type",
'mainheatcont-description': "epc_heating_controls",
"secondheat-description": "epc_secondary_heating",
"transaction-type": "epc_reason",
"energy-consumption-current": "epc_heat_demand",
}
# Currently empty: no column mappings are defined here yet.
# NOTE(review): presumably intended to hold the rename mapping for the
# "find my epc" data, mirroring EPC_API_DATA_NAMES - confirm before relying on it.
FIND_EPC_DATA_NAMES = {
}
DATETIME_REMAP = {
"Pre 1900": datetime(year=1899, month=12, day=31),
}
@ -590,3 +617,18 @@ class AssetList:
def create_lookup_mappings(self):
pass
def merge_data(self, df: pd.DataFrame):
    """
    Left-join extra columns onto the standardised asset list, keyed on the domna property id.

    Every row of the standardised asset list is kept; rows without a matching ID in ``df``
    receive missing values for the new columns.

    :param df: Dataframe holding the DOMNA_PROPERTY_ID column, with at most one row per ID
    :raises ValueError: If ``df`` lacks the DOMNA_PROPERTY_ID column, or that column holds
        duplicated values
    :return: None - ``self.standardised_asset_list`` is replaced by the merged dataframe
    """
    key = self.DOMNA_PROPERTY_ID

    # Guard 1: without the key column there is nothing to join on.
    if key not in df.columns:
        raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}")

    # Guard 2: repeated IDs on the right-hand side would multiply rows in a left join.
    has_repeated_ids = df[key].duplicated().any()
    if has_repeated_ids:
        raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs")

    current = self.standardised_asset_list
    self.standardised_asset_list = current.merge(df, how="left", on=key)

View file

@ -474,20 +474,22 @@ def app():
epc_data.append(csv_data)
epc_df = pd.concat(epc_data)
# TODO: TEMP!!!
epc_df = epc_df.rename(columns={"row_id": asset_list.DOMNA_PROPERTY_ID})
# We expand out the recommendations
recommendations_df = epc_df[["row_id", "recommendations"]]
recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
unique_recommendations = set()
for _, row in recommendations_df.iterrows():
unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
columns = ["row_id"] + list(unique_recommendations)
columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations)
transformed_data = []
for _, row in recommendations_df.iterrows():
# Initialize a dictionary for this row with False for all recommendations
row_data = {col: False for col in columns}
row_data["row_id"] = row["row_id"]
row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID]
# Set True for each recommendation present in this row
for rec in row["recommendations"]:
@ -500,10 +502,11 @@ def app():
transformed_df = pd.DataFrame(transformed_data)
# At the moment, we're only using a limited set of columns - let's just keep cavity wall insulation
# recommendations
transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]
transformed_df = transformed_df[[asset_list.DOMNA_PROPERTY_ID, "Cavity wall insulation"]]
# Get the find my epc data
find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID, "find_my_epc_data"]].drop(
columns=["find_my_epc_data"]).join(
pd.json_normalize(epc_df["find_my_epc_data"])
)
# We check if we get the solar pv column:
@ -513,46 +516,15 @@ def app():
# Retrieve just the data we need
epc_df = epc_df[
[
"row_id",
"uprn",
"address1",
"address",
"postcode",
"property-type",
"built-form",
"inspection-date",
"current-energy-rating",
"current-energy-efficiency",
"roof-description",
"walls-description",
"floor-description",
"transaction-type",
"secondheat-description",
"total-floor-area",
"construction-age-band",
"floor-height",
"number-habitable-rooms",
"mainheat-description",
'mainheatcont-description',
"energy-consumption-current",
"photo-supply",
]
].rename(
columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
[asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
].rename(
columns=asset_list.EPC_API_DATA_NAMES
)
asset_list.merge_data(epc_df)
asset_list.insert_
asset_list = asset_list.merge(
epc_df,
how="left",
on="row_id"
).merge(
epc_df = epc_df.merge(
find_my_epc_data[
[
"row_id", "heating_text", "hot_water_text", 'Assessors name',
asset_list.DOMNA_PROPERTY_ID, "heating_text", "hot_water_text", 'Assessors name',
"Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
"Assessors ID", "Solar photovoltaics"
]
@ -564,31 +536,16 @@ def app():
}
),
how="left",
on="row_id"
on=asset_list.DOMNA_PROPERTY_ID
)
asset_list.merge_data(epc_df)
asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
asset_list = asset_list.drop(columns=["photo-supply"])
# Rename the columns
asset_list = asset_list.rename(columns={
"inspection-date": "Date of last EPC",
"current-energy-efficiency": "SAP score on register",
"current-energy-rating": "EPC rating on register",
"property-type": "Property Type",
"built-form": "Archetype - EPC",
"total-floor-area": "Property Floor Area",
"construction-age-band": "Property Age Band",
"floor-height": "Property Floor Height",
"number-habitable-rooms": "Number of Habitable Rooms",
"walls-description": "Wall Construction",
"roof-description": "Roof Construction",
"floor-description": "Floor Construction",
"mainheat-description": "Heating Type",
"secondheat-description": "Secondary Heating",
"transaction-type": "Reason for last EPC",
"energy-consumption-current": "Heat Demand (kWh/m2)",
})
asset_list = asset_list
asset_list["Estimated Number of Floors"] = asset_list.apply(
lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(