minor changes to add property type as a breakdown in downloader

2026-07-27 23:35:01 +00:00 · 2023-06-30 09:32:58 +01:00 · 2023-06-30 09:32:58 +01:00 · b922d5a9b7
commit b922d5a9b7
parent abed2ce2de
1 changed files with 150 additions and 83 deletions
--- a/model_data/app.py
+++ b/model_data/app.py
@@ -1,3 +1,4 @@
+import pandas as pd
 from tqdm import tqdm
 import os
 from model_data.BoreholeClient import BoreholeClient
@@ -69,19 +70,23 @@ def handler():
        p.set_is_in_conservation_area(conservation_area_client)

    local_authorities = {p.data['local-authority'] for p in input_properties}
+    # TODO: Do this at a constituency level
+    constituencies = {p.data["constituency"] for p in input_properties}
+    property_types = ["bungalow", "flat", "house", "maisonette", "park home"]

-    # TODO: Create a more balanced sample where we grab more properties across different properties
-    #       types, as e.g. we're pulling many more flats than houses
+    # We pull properties from local authorities, by property type. This will allow us to build
+    # a dataset of up to 10k properties per local authority/property type combination
    data = []
    for la in tqdm(local_authorities):
-        data.extend(
-            pagenated_epc_download(
-                client=epc_client,
-                params={"local-authority": la},
-                page_size=5000,
-                n_pages=10,
+        for pt in property_types:
+            data.extend(
+                pagenated_epc_download(
+                    client=epc_client,
+                    params={"local-authority": la, "property-type": pt},
+                    page_size=5000,
+                    n_pages=10,
+                )
            )
-        )

    # Incorporate input data into cleaning
    cleaner = EpcClean(data + [p.data for p in input_properties])
@@ -120,85 +125,147 @@ def handler():
    uvalue_estimates = UvalueEstimations(data=data)
    uvalue_estimates.get_estimates(cleaner=cleaner)

-    input_properties[4].data["address1"]
-    input_properties[4].data["postcode"]
-    floors_df["address1"].values[4]
-    floors_df["original_description"].values[4]
+    # all_data = {
+    #     "input_properties": input_properties,
+    #     "cleaner": cleaner,
+    #     "uvalue_estimates": uvalue_estimates,
+    #     "land_registry_client": land_registry_client,
+    #     "borehole_client": borehole_client,
+    #     "conservation_area_client": conservation_area_client,
+    #     "open_uprn_client": open_uprn_client,
+    #     "data": data
+    # }

-    df = pd.DataFrame(
-        [
-            x.data for x in input_properties
-        ]
-    )
-    df["property-type"].unique()
+    # import pickle
+    # with open("all_data.pkl", "wb") as f:
+    #     pickle.dump(all_data, f)

-    from model_data.recommendations.WallRecommendations import WallRecommendations
-    all_res = []
-    for p in input_properties:
-        inst = WallRecommendations(property_instance=p, uvalue_estimates=uvalue_estimates)
-        inst.recommend()
-        n_recs = len(inst.recommendations)
-        all_res.append(n_recs)
+    # input_properties[4].data["address1"]
+    # input_properties[4].data["postcode"]
+    # floors_df["address1"].values[4]
+    # floors_df["original_description"].values[4]
+    #
+    # df = pd.DataFrame(
+    #     [
+    #         x.data for x in input_properties
+    #     ]
+    # )
+    # df["property-type"].unique()
+    #
+    # from model_data.recommendations.WallRecommendations import WallRecommendations
+    # all_res = []
+    # for p in input_properties:
+    #     inst = WallRecommendations(property_instance=p, uvalue_estimates=uvalue_estimates)
+    #     inst.recommend()
+    #     n_recs = len(inst.recommendations)
+    #     all_res.append(n_recs)
+    #
+    # self = WallRecommendations(property_instance=input_properties[2], uvalue_estimates=uvalue_estimates)
+    # input_properties[6].walls
+    # self.recommend()
+    # df = pd.DataFrame(self.recommendations[0]["parts"])
+    # recommendations = pd.DataFrame(self.recommendations)
+    #
+    # from model_data.recommendations.FloorRecommendations import FloorRecommendations
+    # self = FloorRecommendations(property_instance=input_properties[4], uvalue_estimates=uvalue_estimates)
+    # self.recommendations
+    # self.recommend()
+    # self.recommendations
+    #
+    # # We need to deduce a U-value for "Good" energy efficiency
+    #
+    # mainheating = pd.DataFrame(
+    #     [{"address1": p.address1, "postcode": p.postcode, **p.main_heating} for p in input_properties])
+    # hotwater = pd.DataFrame([{"address1": p.address1, **p.hotwater} for p in input_properties])
+    #
+    # mainheating[["address1", "postcode"]]
+    #
+    # # TODO: I want to know what "Good" efficiency means for the description
+    # #  'Flat 28, 22 Adelina Grove' 'Solid brick, as built, insulated (assumed)'
+    # #    so to do this, filter on the local authority code and property type, where we have U
+    # #   values for the wall and take a median!
+    #
+    # p = input_properties[6]
+    # df = pd.DataFrame(data)
+    #
+    # res = []
+    # for p in input_properties:
+    #     distances = []
+    #     for borehole in tqdm(borehole_client.data, total=len(borehole_client.data)):
+    #         dist_meeters, _ = borehole_client.distance_between_bng_coords(
+    #             x1_bng=p.coordinates['x_coordinate'],
+    #             y1_bng=p.coordinates['y_coordinate'],
+    #             x2_bng=float(borehole['EASTING']),
+    #             y2_bng=float(borehole['NORTHING'])
+    #         )
+    #         distances.append(dist_meeters)
+    #
+    #     res.append(
+    #         {
+    #             "uprn": int(p.data["uprn"]),
+    #             "meters_to_nearest_borehole": min(distances)
+    #         }
+    #
+    #     )
+    # res = pd.DataFrame(res)
+    #
+    # properties_dataset = [
+    #     {
+    #         **p.data,
+    #         "in_conservation_area": p.in_conservation_area,
+    #         **p.coordinates,
+    #
+    #     } for p in input_properties
+    # ]
+    #
+    # properties_dataset = pd.DataFrame(properties_dataset)
+    # properties_dataset = properties_dataset.merge(res, on="uprn", how="left")
+    #
+    # properties_dataset.to_csv("properties_dataset.csv")

-    self = WallRecommendations(property_instance=input_properties[2], uvalue_estimates=uvalue_estimates)
-    input_properties[6].walls
-    self.recommend()
-    df = pd.DataFrame(self.recommendations[0]["parts"])
-    recommendations = pd.DataFrame(self.recommendations)
-
-    from model_data.recommendations.FloorRecommendations import FloorRecommendations
-    self = FloorRecommendations(property_instance=input_properties[4], uvalue_estimates=uvalue_estimates)
-    self.recommendations
-    self.recommend()
-    self.recommendations
-
-    # We need to deduce a U-value for "Good" energy effieciency
-
-    mainheating = pd.DataFrame(
-        [{"address1": p.address1, "postcode": p.postcode, **p.main_heating} for p in input_properties])
-    hotwater = pd.DataFrame([{"address1": p.address1, **p.hotwater} for p in input_properties])
-
-    mainheating[["address1", "postcode"]]
-
-    # TODO: I want to knwo what "Good" efficiency means for the description
-    #  'Flat 28, 22 Adelina Grove' 'Solid brick, as built, insulated (assumed)'
-    #    so to do this, filter on the local authority code and property type, where we have U
-    #   values for the wall and take a median!
-
-    p = input_properties[6]
+    # We test estimating gain
+    import pandas as pd
+    pd.set_option('display.max_rows', 500)
+    pd.set_option('display.max_columns', 500)
+    pd.set_option('display.width', 1000)
    df = pd.DataFrame(data)

-    res = []
-    for p in input_properties:
-        distances = []
-        for borehole in tqdm(borehole_client.data, total=len(borehole_client.data)):
-            dist_meeters, _ = borehole_client.distance_between_bng_coords(
-                x1_bng=p.coordinates['x_coordinate'],
-                y1_bng=p.coordinates['y_coordinate'],
-                x2_bng=float(borehole['EASTING']),
-                y2_bng=float(borehole['NORTHING'])
-            )
-            distances.append(dist_meeters)
-
-        res.append(
-            {
-                "uprn": int(p.data["uprn"]),
-                "meters_to_nearest_borehole": min(distances)
-            }
-
-        )
-    res = pd.DataFrame(res)
-
-    properties_dataset = [
-        {
-            **p.data,
-            "in_conservation_area": p.in_conservation_area,
-            **p.coordinates,
-
-        } for p in input_properties
+    # We want to estimate the effect of making improvements to different property components
+    response = "environment-impact-current"
+    base_features = [
+        "property-type",
+        "built-form",
+        # "construction-age-band",
+        "number-habitable-rooms",
    ]

-    properties_dataset = pd.DataFrame(properties_dataset)
-    properties_dataset = properties_dataset.merge(res, on="uprn", how="left")
+    component_features = [
+        "walls-description",
+        "floor-description",
+    ]

-    properties_dataset.to_csv("properties_dataset.csv")
+    model_data = df[[response] + component_features + base_features]
+    model_data = model_data.reset_index()
+    model_data["idx"] = model_data.index.copy()
+    summary = (
+        model_data
+        .groupby(component_features + base_features)
+        .agg({response: 'median', "idx": 'size'})
+        .reset_index()
+    )
+
+    summary = summary.sort_values("walls-description")
+
+    example = summary[
+        (summary["walls-description"].isin(
+            [
+                "Solid brick, as built, no insulation (assumed)",
+                "Solid brick, as built, partial insulation (assumed)",
+                "Solid brick, as built, insulated (assumed)",
+            ]
+        )) &
+        (summary["property-type"] == "House") &
+        (summary["built-form"] == "Detached") &
+        # (summary["construction-age-band"] == "England and Wales: 1976-1982")
+        (summary["number-habitable-rooms"] == "4")
+        ]