diff --git a/model_data/app.py b/model_data/app.py index 2edd832d..a19635e3 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -1,3 +1,4 @@ +import pandas as pd from tqdm import tqdm import os from model_data.BoreholeClient import BoreholeClient @@ -69,19 +70,23 @@ def handler(): p.set_is_in_conservation_area(conservation_area_client) local_authorities = {p.data['local-authority'] for p in input_properties} + # TODO: Do this at a constituency level + constituencies = {p.data["constituency"] for p in input_properties} + property_types = ["bungalow", "flat", "house", "maisonette", "park home"] - # TODO: Create a more balanced sample where we grab more properties across different properties - # types, as e.g. we're pulling many more flats than houses + # We pull properties from local authorities, by property type. This will allow us to build + # a dataset of up to 10k properties per local authority/property type combination data = [] for la in tqdm(local_authorities): - data.extend( - pagenated_epc_download( - client=epc_client, - params={"local-authority": la}, - page_size=5000, - n_pages=10, + for pt in property_types: + data.extend( + pagenated_epc_download( + client=epc_client, + params={"local-authority": la, "property-type": pt}, + page_size=5000, + n_pages=10, + ) ) - ) # Incorporate input data into cleaning cleaner = EpcClean(data + [p.data for p in input_properties]) @@ -120,85 +125,147 @@ def handler(): uvalue_estimates = UvalueEstimations(data=data) uvalue_estimates.get_estimates(cleaner=cleaner) - input_properties[4].data["address1"] - input_properties[4].data["postcode"] - floors_df["address1"].values[4] - floors_df["original_description"].values[4] + # all_data = { + # "input_properties": input_properties, + # "cleaner": cleaner, + # "uvalue_estimates": uvalue_estimates, + # "land_registry_client": land_registry_client, + # "borehole_client": borehole_client, + # "conservation_area_client": conservation_area_client, + # "open_uprn_client": open_uprn_client, + # "data": data + # } - df = pd.DataFrame( - [ - x.data for x in input_properties - ] - ) - df["property-type"].unique() + # import pickle + # with open("all_data.pkl", "wb") as f: + # pickle.dump(all_data, f) - from model_data.recommendations.WallRecommendations import WallRecommendations - all_res = [] - for p in input_properties: - inst = WallRecommendations(property_instance=p, uvalue_estimates=uvalue_estimates) - inst.recommend() - n_recs = len(inst.recommendations) - all_res.append(n_recs) + # input_properties[4].data["address1"] + # input_properties[4].data["postcode"] + # floors_df["address1"].values[4] + # floors_df["original_description"].values[4] + # + # df = pd.DataFrame( + # [ + # x.data for x in input_properties + # ] + # ) + # df["property-type"].unique() + # + # from model_data.recommendations.WallRecommendations import WallRecommendations + # all_res = [] + # for p in input_properties: + # inst = WallRecommendations(property_instance=p, uvalue_estimates=uvalue_estimates) + # inst.recommend() + # n_recs = len(inst.recommendations) + # all_res.append(n_recs) + # + # self = WallRecommendations(property_instance=input_properties[2], uvalue_estimates=uvalue_estimates) + # input_properties[6].walls + # self.recommend() + # df = pd.DataFrame(self.recommendations[0]["parts"]) + # recommendations = pd.DataFrame(self.recommendations) + # + # from model_data.recommendations.FloorRecommendations import FloorRecommendations + # self = FloorRecommendations(property_instance=input_properties[4], uvalue_estimates=uvalue_estimates) + # self.recommendations + # self.recommend() + # self.recommendations + # + # # We need to deduce a U-value for "Good" energy effieciency + # + # mainheating = pd.DataFrame( + # [{"address1": p.address1, "postcode": p.postcode, **p.main_heating} for p in input_properties]) + # hotwater = pd.DataFrame([{"address1": p.address1, **p.hotwater} for p in input_properties]) + # + # mainheating[["address1", "postcode"]] + # + # # TODO: I want to knwo what "Good" efficiency means for the description + # # 'Flat 28, 22 Adelina Grove' 'Solid brick, as built, insulated (assumed)' + # # so to do this, filter on the local authority code and property type, where we have U + # # values for the wall and take a median! + # + # p = input_properties[6] + # df = pd.DataFrame(data) + # + # res = [] + # for p in input_properties: + # distances = [] + # for borehole in tqdm(borehole_client.data, total=len(borehole_client.data)): + # dist_meeters, _ = borehole_client.distance_between_bng_coords( + # x1_bng=p.coordinates['x_coordinate'], + # y1_bng=p.coordinates['y_coordinate'], + # x2_bng=float(borehole['EASTING']), + # y2_bng=float(borehole['NORTHING']) + # ) + # distances.append(dist_meeters) + # + # res.append( + # { + # "uprn": int(p.data["uprn"]), + # "meters_to_nearest_borehole": min(distances) + # } + # + # ) + # res = pd.DataFrame(res) + # + # properties_dataset = [ + # { + # **p.data, + # "in_conservation_area": p.in_conservation_area, + # **p.coordinates, + # + # } for p in input_properties + # ] + # + # properties_dataset = pd.DataFrame(properties_dataset) + # properties_dataset = properties_dataset.merge(res, on="uprn", how="left") + # + # properties_dataset.to_csv("properties_dataset.csv") - self = WallRecommendations(property_instance=input_properties[2], uvalue_estimates=uvalue_estimates) - input_properties[6].walls - self.recommend() - df = pd.DataFrame(self.recommendations[0]["parts"]) - recommendations = pd.DataFrame(self.recommendations) - - from model_data.recommendations.FloorRecommendations import FloorRecommendations - self = FloorRecommendations(property_instance=input_properties[4], uvalue_estimates=uvalue_estimates) - self.recommendations - self.recommend() - self.recommendations - - # We need to deduce a U-value for "Good" energy effieciency - - mainheating = pd.DataFrame( - [{"address1": p.address1, "postcode": p.postcode, **p.main_heating} for p in input_properties]) - hotwater = pd.DataFrame([{"address1": p.address1, **p.hotwater} for p in input_properties]) - - mainheating[["address1", "postcode"]] - - # TODO: I want to knwo what "Good" efficiency means for the description - # 'Flat 28, 22 Adelina Grove' 'Solid brick, as built, insulated (assumed)' - # so to do this, filter on the local authority code and property type, where we have U - # values for the wall and take a median! - - p = input_properties[6] + # We test estimating gain + import pandas as pd + pd.set_option('display.max_rows', 500) + pd.set_option('display.max_columns', 500) + pd.set_option('display.width', 1000) df = pd.DataFrame(data) - res = [] - for p in input_properties: - distances = [] - for borehole in tqdm(borehole_client.data, total=len(borehole_client.data)): - dist_meeters, _ = borehole_client.distance_between_bng_coords( - x1_bng=p.coordinates['x_coordinate'], - y1_bng=p.coordinates['y_coordinate'], - x2_bng=float(borehole['EASTING']), - y2_bng=float(borehole['NORTHING']) - ) - distances.append(dist_meeters) - - res.append( - { - "uprn": int(p.data["uprn"]), - "meters_to_nearest_borehole": min(distances) - } - - ) - res = pd.DataFrame(res) - - properties_dataset = [ - { - **p.data, - "in_conservation_area": p.in_conservation_area, - **p.coordinates, - - } for p in input_properties + # We want to estimate for making improvements on different property components + response = "environment-impact-current" + base_features = [ + "property-type", + "built-form", + # "construction-age-band", + "number-habitable-rooms", ] - properties_dataset = pd.DataFrame(properties_dataset) - properties_dataset = properties_dataset.merge(res, on="uprn", how="left") + component_features = [ + "walls-description", + "floor-description", + ] - properties_dataset.to_csv("properties_dataset.csv") + model_data = df[[response] + component_features + base_features] + model_data = model_data.reset_index() + model_data["idx"] = model_data.index.copy() + summary = ( + model_data + .groupby(component_features + base_features) + .agg({response: 'median', "idx": 'size'}) + .reset_index() + ) + + summary = summary.sort_values("walls-description") + + example = summary[ + (summary["walls-description"].isin( + [ + "Solid brick, as built, no insulation (assumed)", + "Solid brick, as built, partial insulation (assumed)", + "Solid brick, as built, insulated (assumed)", + ] + )) & + (summary["property-type"] == "House") & + (summary["built-form"] == "Detached") & + # (summary["construction-age-band"] == "England and Wales: 1976-1982") + (summary["number-habitable-rooms"] == "4") + ]