diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py index 0704d64d..93797db0 100644 --- a/etl/customers/stonewater/shdf_3_clustering.py +++ b/etl/customers/stonewater/shdf_3_clustering.py @@ -1992,6 +1992,13 @@ def updated_version(): priority_postcodes, previous_waves_address_id, master_sheet = read_stonewater_asset_data() + # Pull in the EPC data + epc_data = read_epc_data(uprn_lookup_2) + + ######################################################################## + # Prepare the data + ######################################################################## + # Filter the asset list down to the priority postcodes asset_list["is_priority_postcode"] = asset_list["postcode"].isin(priority_postcodes) @@ -2012,8 +2019,17 @@ def updated_version(): right_on="Address ID" ) + asset_list = asset_list.merge( + epc_data[["internal_id", "current-energy-efficiency", "lodgement-date", "estimated"]], + how="left", + on="internal_id" + ) + asset_list["days_since_lodgement_epc"] = ( + datetime.now() - pd.to_datetime(asset_list["lodgement-date"], errors="coerce", dayfirst=True) + ).dt.days + # Flag properties that were surveyed within the last 5 years - asset_list["epc_within_5_years"] = asset_list["days_since_lodgement"] < 5 * 365 + asset_list["epc_within_5_years"] = asset_list["days_since_lodgement_epc"] < 5 * 365 # Identify properties where they've had an EPC done within the last 5 years, where the SAP rating is already # a EPC C. Alternatively, any property with an EPC rating of 80 or above is also considered, regardless of when @@ -2027,9 +2043,9 @@ def updated_version(): asset_list["is_priority_postcode"] & ~asset_list["In Osmosis Wave 2.1"] & ~asset_list["is_epc_c_or_above"] ][ [ - "internal_id", "customer_asset_id", "postcode", "house_number", "address1", "address2", "city_town", - "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", "epc_within_5_years", - "EPC Rating" + "internal_id", "uprn", "udprn", "customer_asset_id", "postcode", "house_number", "address1", "address2", + "city_town", "county", "external_address_id", "owner", "days_since_lodgement", "Lodgement Date", + "epc_within_5_years", "EPC Rating", "estimated", "current-energy-efficiency", "lodgement-date" ] ] @@ -2043,7 +2059,22 @@ def updated_version(): right_on="Address ID" ) - # Pull in the EPC data + # For SAP, we use the most recent EPC if epc_within_5_years is True, otherwise we use the parity modelled sap + clustering_features["current-energy-efficiency"] = clustering_features["current-energy-efficiency"].astype(float) + clustering_features["representative_sap"] = np.where( + clustering_features["epc_within_5_years"], + clustering_features["current-energy-efficiency"], + clustering_features["parity_modelled_sap"] + ) + + # incorect_epcs = clustering_features[ + # clustering_features["EPC Rating"] != clustering_features["current-energy-efficiency"]] + # incorect_epcs = incorect_epcs[ + # ~pd.isnull(incorect_epcs["current-energy-efficiency"]) & pd.isnull(incorect_epcs["estimated"]) + # ] + # incorect_epcs = incorect_epcs.rename(columns={"current-energy-efficiency": "Current SAP Rating"}) + # # Store data + # incorect_epcs.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Stonewater/Incorrect EPCs.csv", index=False) def read_asset_list(): @@ -2260,6 +2291,7 @@ def read_epc_data(uprn_lookup_2): s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", bucket_name="retrofit-data-dev" ) + epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) complete_epcs = pd.concat([epc_data, epc_data_batch_2])