diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index c397f962..3b44d560 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1635,8 +1635,9 @@ def propsed_wave_3_sample(): header=4 ) - # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater - asset_list = asset_list[asset_list["Archetype ID"] != "NOT PRIORITY POSTCODE"] + # TODO: We drop 302 properties that are not priority postcodes - confirm w/ Stonewater and 7 properties missing + # UPRN + asset_list = asset_list[~asset_list["Archetype ID"].isin(["NOT PRIORITY POSTCODE", "MISSING UPRN"])] # Clean address ids asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] asset_list = asset_list[asset_list["Address ID"] != "Address ID"] @@ -1648,7 +1649,7 @@ def propsed_wave_3_sample(): # Keep just the columns we need asset_list = asset_list[ - ["Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", + ["UPRN", "Address ID", "Archetype ID", "Postal Region", "Postcode", "Property Type", "Wall Type", "Roof Type", "Heating"] ] @@ -1665,7 +1666,7 @@ def propsed_wave_3_sample(): survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] survey_results_with_original_features = survey_results.merge( - asset_list[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], on="Address ID", how="left" ) @@ -1673,6 +1674,45 @@ def propsed_wave_3_sample(): if survey_results_with_original_features.shape[0] != survey_results.shape[0]: raise ValueError("Something went wrong") + # We get longitude & Latitude + from utils.s3 import read_pickle_from_s3 + archetyping_spatial_features = read_pickle_from_s3( + bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", + ) + archetyping_spatial_features = pd.concat(archetyping_spatial_features) + archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename( + columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"} + ) + # Merge them onto both datasets + asset_list = asset_list.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(asset_list["longitude"]).sum(): + raise ValueError("Something went wrong") + + survey_results_with_original_features = survey_results_with_original_features.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(survey_results_with_original_features["longitude"]).sum(): + raise ValueError("Something went wrong") + + def haversine(lat1, lon1, lat2, lon2): + # Radius of Earth in meters + R = 6371000 + + # Convert degrees to radians + lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) + + # Differences + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Haversine formula + a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2 + c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) + distance = R * c + return distance + # Tier definitions # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D @@ -1716,6 +1756,7 @@ def propsed_wave_3_sample(): ][["Archetype ID", "Current EPC Band"]].drop_duplicates() if region_surveyed["Archetype ID"].duplicated().sum(): + blah1 region_surveyed = survey_results[ survey_results["Archetype ID"].isin(archetypes) & (survey_results["Postal Region"] == region) @@ -1755,23 +1796,46 @@ def propsed_wave_3_sample(): survey_results["Archetype ID"].isin(missed_archetypes) ][["Archetype ID", "Current EPC Band"]].drop_duplicates() - # TODO - We could average the property?? And call it borderline, call out it was averaged!!! - # We could also find the nearest property to it, with similar wall, roof, heating? - # Can use long/lag to distance calc. We have this data from previous - if archetype_surveyed["Archetype ID"].duplicated().sum(): - archetype_surveyed = survey_results[ - survey_results["Archetype ID"].isin(missed_archetypes) - ].groupby("Archetype ID")[["Current SAP Rating"]].mean().reset_index() - archetype_surveyed["Current EPC Band"] = archetype_surveyed["Current SAP Rating"].apply(sap_to_epc) - archetype_surveyed = archetype_surveyed.drop(columns=["Current SAP Rating"]) - region_assets = region_assets.merge( - archetype_surveyed, - on="Archetype ID", - how="left", - suffixes=("", "_method2") - ) + archetype_surveyed = [] + for arch_id in missed_archetypes: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + archetype_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc + } + ) + archetype_surveyed = pd.DataFrame(archetype_surveyed) + region_assets = region_assets.merge( + archetype_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method2") + ) + else: + + region_assets = region_assets.merge( + archetype_surveyed, + on="Archetype ID", + how="left", + suffixes=("", "_method2") + ) region_assets["Confidence Tier"] = np.where( region_assets["Current EPC Band_method2"].isin(["D", "E", "F", "G"]) & pd.isnull( @@ -1792,6 +1856,16 @@ def propsed_wave_3_sample(): "5 - EPC C or above", region_assets["Confidence Tier"] ) + region_assets["Confidence Tier"] = np.where( + region_assets["Archetype ID"] == "EPC C OR ABOVE", + "5 - EPC C or above", region_assets["Confidence Tier"] + ) + + region_assets["Current EPC Band"] = np.where( + region_assets["Archetype ID"] == "EPC C OR ABOVE", + "C", region_assets["Current EPC Band"] + ) + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() if not missed_addressids: @@ -1803,17 +1877,10 @@ def propsed_wave_3_sample(): for a_id in missed_addressids: property = asset_list[asset_list["Address ID"] == a_id].squeeze() - if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: - filter_property_types = ["House", "Bungalow"] - else: - filter_property_types = ["Flat"] - - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + surveyed = survey_results_with_original_features[ ( - survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - ) + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] ) & ( survey_results_with_original_features["Wall Type"].str.split(":").str[0] == @@ -1827,62 +1894,38 @@ def propsed_wave_3_sample(): survey_results_with_original_features["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0] ) - ] - if surveyed_similar.empty: - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) & - (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == - property["Wall Type"].split(":")[0]) & - (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == - property["Roof Type"].split(":")[0]) & - (survey_results_with_original_features["Heating"].str.split(":").str[0] == - property["Heating"].split(":")[0]) - ] + ].copy() - if surveyed_similar.empty: + if surveyed.empty: + blah3 - # We get an average based on the postcode - surveyed_similar = survey_results_with_original_features[ - (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & - (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( - filter_property_types - )) - ] - if surveyed_similar.empty: - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": "4 - no similar property, needs survey to confirm", - "Current EPC Band": "Unknown" - } + # Calculate distance + surveyed["distance_meters"] = haversine( + lat1=property["latitude"], lon1=property["longitude"], + lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values + ) + surveyed = surveyed.sort_values("distance_meters", ascending=True) - ) - else: - expected_sap = surveyed_similar["Current SAP Rating"].mean() - expected_epc = sap_to_epc(expected_sap) - if expected_epc in ["C", "B", "A"]: - tier = "5 - EPC C or above" - else: - tier = "3 - similar property, relaxed conditions" + # Check if we have a postcode match check if surveyed postcode is the same as the property postcode + if any(surveyed["Postcode"] == property["Postcode"]): + surveyed_similar = surveyed[surveyed["Postcode"] == property["Postcode"]] - final_missed_matches.append( - { - "Address ID": a_id, - "Confidence Tier": tier, - "Current EPC Band": expected_epc - } - ) - continue - # We take an average - expected_sap = surveyed_similar["Current SAP Rating"].mean() + if any(surveyed["Postal Region"] == property["Postal Region"]): + surveyed_similar = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + + # Take the 5 nearest + surveyed_similar = surveyed_similar.head(5) + + # perform a weighted mean of SAP rating - the closer the better + expected_sap = np.average( + surveyed_similar["Current SAP Rating"], weights=1 / (surveyed_similar["distance_meters"] + 1) + ) expected_epc = sap_to_epc(expected_sap) + if expected_epc in ["C", "B", "A"]: tier = "5 - EPC C or above" else: - tier = "3 - similar property" + tier = "3 - similar property, weighted on distance" final_missed_matches.append( { @@ -1891,6 +1934,121 @@ def propsed_wave_3_sample(): "Current EPC Band": expected_epc } ) + continue + + # if property["Property Type"].split(":")[0] in ["House", "Bungalow"]: + # filter_property_types = ["House", "Bungalow"] + # else: + # filter_property_types = ["Flat"] + # + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postcode"] == property["Postcode"]) & + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # ) + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # if surveyed_similar.empty: + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) & + # (survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0]) & + # (survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0]) & + # (survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0]) + # ] + # + # if surveyed_similar.empty: + # + # # We get an average based on the postcode + # surveyed_similar = survey_results_with_original_features[ + # (survey_results_with_original_features["Postal Region"] == property["Postal Region"]) & + # (survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + # filter_property_types + # )) + # ] + # if surveyed_similar.empty: + # surveyed_similar_entire_population = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"].str.split(":").str[0] == property[ + # "Property Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + # property["Wall Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"].str.split(":").str[0] == + # property["Heating"].split(":")[0] + # ) + # ] + # + # # We order them by distance on postcode + # + # # Average + # expected_sap = surveyed_similar_entire_population["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": "3 - similar property, all areas searched", + # "Current EPC Band": expected_epc + # } + # + # ) + # else: + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # tier = "3 - similar property, relaxed conditions" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) + # continue + # # We take an average + # expected_sap = surveyed_similar["Current SAP Rating"].mean() + # expected_epc = sap_to_epc(expected_sap) + # if expected_epc in ["C", "B", "A"]: + # tier = "5 - EPC C or above" + # else: + # tier = "3 - similar property" + # + # final_missed_matches.append( + # { + # "Address ID": a_id, + # "Confidence Tier": tier, + # "Current EPC Band": expected_epc + # } + # ) final_missed_matches = pd.DataFrame(final_missed_matches) @@ -1928,27 +2086,33 @@ def propsed_wave_3_sample(): # We create the gain and loss columns # Gain is the sum of these columns: - # '1 - Archetype surveyed', '1 - property was surveyed', - # '2 - same archetype', '3 - similar property', + # '1 - Archetype surveyed', + # '1 - property was surveyed', + # '2 - same archetype', + # '3 - similar property', + # '3 - similar property, all areas searched', + # '3 - similar property, relaxed conditions' + # # Loss is the sum of these columns: # '4 - no similar property, needs survey to confirm', # '5 - EPC C or above', '5 - property was surveyed' geographic_summary["Gain"] = geographic_summary[ - ['1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property'] + [ + '1 - Archetype surveyed', '1 - property was surveyed', '2 - same archetype', '3 - similar property', + '3 - similar property, all areas searched', '3 - similar property, relaxed conditions' + ] ].sum(axis=1) geographic_summary["Loss"] = geographic_summary[ - ['4 - no similar property, needs survey to confirm', '5 - EPC C or above', '5 - property was surveyed'] + ['5 - EPC C or above', '5 - property was surveyed'] ].sum(axis=1) - geographic_summary.sum() + print(geographic_summary.sum()) geographic_summary = geographic_summary.sort_values("Loss", ascending=True) geographic_summary["Loss Cumulative Sum"] = geographic_summary["Loss"].cumsum() geographic_summary[geographic_summary["Loss Cumulative Sum"] <= 250]["Gain"].sum() - geographic_summary[["Loss", "Gain"]].head() - loss = geographic_summary["Loss"].values gain = geographic_summary["Gain"].values