diff --git a/model_data/Property.py b/backend/Property.py similarity index 100% rename from model_data/Property.py rename to backend/Property.py diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index aa8655bf..4934042e 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -3,7 +3,7 @@ from backend.app.dependencies import validate_token from backend.app.plan.schemas import PlanTriggerRequest from backend.app.utils import read_csv_from_s3 from backend.app.config import get_settings -from model_data.Property import Property +from backend.Property import Property from epc_api.client import EpcClient from utils.logger import setup_logger from recommendations.FloorRecommendations import FloorRecommendations diff --git a/model_data/tests/test_property.py b/backend/tests/test_property.py similarity index 100% rename from model_data/tests/test_property.py rename to backend/tests/test_property.py diff --git a/model_data/analysis/SapModel.py b/model_data/analysis/SapModel.py index e39296ce..021e503d 100644 --- a/model_data/analysis/SapModel.py +++ b/model_data/analysis/SapModel.py @@ -2,8 +2,7 @@ import numpy as np import pandas as pd import statsmodels.api as sm import matplotlib.pyplot as plt -import pickle -from typing import Any, Dict, Tuple, Optional, List +from typing import Dict, Optional, List from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, \ median_absolute_error, mean_absolute_percentage_error @@ -18,10 +17,6 @@ from utils.logger import setup_logger logger = setup_logger() -# with open("all_data.pkl", "rb") as f: -# all_data = pickle.load(f) - - class SapModel: # We want to estimate for making improvements on different property components RESPONSE = "current-energy-efficiency" diff --git a/model_data/app.py b/model_data/app.py index fe065f26..bfe11ce3 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -1,16 
+1,12 @@ from tqdm import tqdm import os -from model_data.BoreholeClient import BoreholeClient -from model_data.LandRegistryClient import LandRegistryClient -from model_data.temp_inputs import input_data -from model_data.Property import Property from model_data.config import EPC_AUTH_TOKEN from epc_api.client import EpcClient from model_data.downloader import pagenated_epc_download from model_data.EpcClean import EpcClean -from open_uprn.OpenUprnClient import OpenUprnClient from model_data.analysis.UvalueEstimations import UvalueEstimations +from model_data.analysis.SapModel import SapModel LAND_REGISTRY_PATHS = [ os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", @@ -24,345 +20,6 @@ LAND_REGISTRY_PATHS = [ ] -def handler(): - # To begin with, the input data is a list of dictionaries, however we would read this file in - - epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) - - input_properties = [ - Property(postcode=config['postcode'], address1=config['address1'], epc_client=epc_client) - for config in input_data - ] - - for p in input_properties: - p.search_address_epc() - p.set_year_built() - - uprns = [p.data['uprn'] for p in input_properties] - - open_uprn_client = OpenUprnClient( - path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/osopenuprn_202306_csv/osopenuprn_202305.csv", - uprns=uprns - ) - open_uprn_client.read() - - # We're using Ordinance Survey Open Uprn data - # to find the coordinates of each address, which we will then be able to use at a later stage - for p in input_properties: - p.get_coordinates(open_uprn_client) - - conservation_area_client = ConservationAreaClient( - historic_england_path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/Historic_Eng_Conservation_Areas/Conservation_Areas.shp", - gov_path=os.path.abspath( - os.path.dirname(__file__) - ) + "/model_data/local_data/gov-conservation-area.geojson" - ) - 
conservation_area_client.read() - - # Check if the property is in a conversation area - for p in input_properties: - in_conservation_area = conservation_area_client.is_in_conservation_area(p.coordinates) - p.set_is_in_conservation_area(in_conservation_area) - - local_authorities = {p.data['local-authority'] for p in input_properties} - # TODO: Do this at a constituency level - constituencies = {p.data["constituency"] for p in input_properties} - property_types = ["bungalow", "flat", "house", "maisonette", "park home"] - floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"] - - # We pull properties from local authorities, by property type. This will allow us to build - # a dataset of up to 10k properties per local authority/property type combination - # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were - # conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England - # and Wales from 31 July 2014 - # Download data from August 2014 onwards - data = [] - for c in tqdm(constituencies): - for pt in property_types: - for fa in floor_areas: - data.extend( - pagenated_epc_download( - client=epc_client, - params={ - "constituency": c, - "property-type": pt, - "from-month": 8, - "from-year": 2014, - "floor-area": fa, - }, - page_size=5000, - n_pages=10, - ) - ) - - # Incorporate input data into cleaning - cleaner = EpcClean(data + [p.data for p in input_properties]) - cleaner.clean() - - z = [x for x in data if x["floor-description"] == "(anheddiad arall islaw)"] - - address_meta = [ - { - "postcode": x["postcode"].upper(), - "address1": x["address1"].upper(), - "address2": x["address2"].upper(), - "address3": x["address3"].upper(), - "address": x["address"], - "uprn": x["uprn"] - } for x in data - ] - - import pickle - with open("sample_addresses.pkl", "wb") as f: - pickle.dump(address_meta, f) - - # Land registry - land_registry_client = LandRegistryClient( - 
paths=LAND_REGISTRY_PATHS, - addresses=address_meta - ) - lr_data = land_registry_client.read() - - # Borehole - borehole_client = BoreholeClient( - path=os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/borehole/borehole.dbf" - ) - borehole_client.read() - - # Now, for our input properties, we need to identify the components of the building, based - # on the cleaning we've done - for p in input_properties: - p.get_components(cleaner) - - # TODO: Add property age band into this - uvalue_estimates = UvalueEstimations(data=data) - uvalue_estimates.get_estimates(cleaner=cleaner) - - x = {'low-energy-fixed-light-count': '', 'address': 'Flat 28, 22, Adelina Grove', 'uprn-source': 'Address Matched', - 'floor-height': '', 'heating-cost-potential': '668', 'unheated-corridor-length': '7.73', - 'hot-water-cost-potential': '190', 'construction-age-band': 'England and Wales: 1991-1995', - 'potential-energy-rating': 'D', 'mainheat-energy-eff': 'Very Poor', 'windows-env-eff': 'Average', - 'lighting-energy-eff': 'Average', 'environment-impact-potential': '46', - 'glazed-type': 'double glazing, unknown install date', 'heating-cost-current': '1081', 'address3': '', - 'mainheatcont-description': 'No time or thermostatic control of room temperature', - 'sheating-energy-eff': 'N/A', 'property-type': 'Flat', 'local-authority-label': 'Tower Hamlets', - 'fixed-lighting-outlets-count': '', 'energy-tariff': 'dual', 'mechanical-ventilation': 'natural', - 'hot-water-cost-current': '190', 'county': 'Greater London Authority', 'postcode': 'E1 3BX', - 'solar-water-heating-flag': 'N', 'constituency': 'E14000555', 'co2-emissions-potential': '5.2', - 'number-heated-rooms': '2', 'floor-description': '(another dwelling below)', - 'energy-consumption-potential': '301', 'local-authority': 'E09000030', 'built-form': 'Semi-Detached', - 'number-open-fireplaces': '0', 'windows-description': 'Fully double glazed', 'glazed-area': 'Normal', - 'inspection-date': '2018-09-05', 
'mains-gas-flag': 'N', 'co2-emiss-curr-per-floor-area': '53', - 'address1': 'Flat 28', 'heat-loss-corridor': 'unheated corridor', 'flat-storey-count': '', - 'constituency-label': 'Bethnal Green and Bow', 'roof-energy-eff': 'Average', 'total-floor-area': '103.0', - 'building-reference-number': '4441803568', 'environment-impact-current': '44', 'co2-emissions-current': '5.5', - 'roof-description': 'Pitched, insulated (assumed)', 'floor-energy-eff': 'NO DATA!', - 'number-habitable-rooms': '2', 'address2': '22, Adelina Grove', 'hot-water-env-eff': 'Poor', - 'posttown': 'LONDON', 'mainheatc-energy-eff': 'Very Poor', 'main-fuel': 'electricity (not community)', - 'lighting-env-eff': 'Average', 'windows-energy-eff': 'Average', 'floor-env-eff': 'N/A', - 'sheating-env-eff': 'N/A', 'lighting-description': 'Low energy lighting in 25% of fixed outlets', - 'roof-env-eff': 'Average', 'walls-energy-eff': 'Good', 'photo-supply': '', 'lighting-cost-potential': '84', - 'mainheat-env-eff': 'Very Poor', 'multi-glaze-proportion': '100', 'main-heating-controls': '2701', - 'lodgement-datetime': '2018-09-06 17:25:59', 'flat-top-storey': 'Y', 'current-energy-rating': 'E', - 'secondheat-description': 'None', 'walls-env-eff': 'Good', 'transaction-type': 'rental (private)', - 'uprn': '6032920', 'current-energy-efficiency': '48', 'energy-consumption-current': '316', - 'mainheat-description': 'Electric ceiling heating', 'lighting-cost-current': '147', - 'lodgement-date': '2018-09-06', 'extension-count': '1', 'mainheatc-env-eff': 'Very Poor', - 'lmk-key': '175926409402018090617255958380158', 'wind-turbine-count': '0', 'tenure': 'rental (private)', - 'floor-level': '4th', 'potential-energy-efficiency': '67', 'hot-water-energy-eff': 'Average', - 'low-energy-lighting': '25', 'walls-description': 'Solid brick, as built, insulated (assumed)', - 'hotwater-description': 'Electric immersion, off-peak'} - from utils.uvalue_estimates import classify_decile_newvalues - total_floor_area_group_decile = 
UvalueEstimations.classify_decile_newvalues( - decile_boundaries=uvalue_estimates.walls_decile_data["decile_boundaries"], - decile_labels=uvalue_estimates.walls_decile_data["decile_labels"], - new_values=[float(x["total-floor-area"])], - )[0] - - u_value_estimate = uvalue_estimates.walls[ - (uvalue_estimates.walls["local-authority"] == x["local-authority"]) & - (uvalue_estimates.walls["property-type"] == x["property-type"]) & - (uvalue_estimates.walls["built-form"] == x["built-form"]) & - (uvalue_estimates.walls["walls-energy-eff"] == x["walls-energy-eff"]) & - (uvalue_estimates.walls["walls-env-eff"] == x["walls-env-eff"]) & - (uvalue_estimates.walls["total-floor-area_group"] == total_floor_area_group_decile) - ] - - uvalue_estimates.walls[ - uvalue_estimates.walls - ] - - # all_data = { - # "input_properties": input_properties, - # "cleaner": cleaner, - # "uvalue_estimates": uvalue_estimates, - # "land_registry_client": land_registry_client, - # "borehole_client": borehole_client, - # "conservation_area_client": conservation_area_client, - # "open_uprn_client": open_uprn_client, - # "data": data - # } - - # import pickle - # with open("all_data.pkl", "wb") as f: - # pickle.dump(all_data, f) - - # input_properties[4].data["address1"] - # input_properties[4].data["postcode"] - # floors_df["address1"].values[4] - # floors_df["original_description"].values[4] - # - # df = pd.DataFrame( - # [ - # x.data for x in input_properties - # ] - # ) - # df["property-type"].unique() - # - # from model_data.recommendations.WallRecommendations import WallRecommendations - # all_res = [] - # for p in input_properties: - # inst = WallRecommendations(property_instance=p, uvalue_estimates=uvalue_estimates) - # inst.recommend() - # n_recs = len(inst.recommendations) - # all_res.append(n_recs) - # - # self = WallRecommendations(property_instance=input_properties[2], uvalue_estimates=uvalue_estimates) - # input_properties[6].walls - # self.recommend() - # df = 
pd.DataFrame(self.recommendations[0]["parts"]) - # recommendations = pd.DataFrame(self.recommendations) - # - # from model_data.recommendations.FloorRecommendations import FloorRecommendations - # self = FloorRecommendations(property_instance=input_properties[4], uvalue_estimates=uvalue_estimates) - # self.recommendations - # self.recommend() - # self.recommendations - # - # # We need to deduce a U-value for "Good" energy effieciency - # - # mainheating = pd.DataFrame( - # [{"address1": p.address1, "postcode": p.postcode, **p.main_heating} for p in input_properties]) - # hotwater = pd.DataFrame([{"address1": p.address1, **p.hotwater} for p in input_properties]) - # - # mainheating[["address1", "postcode"]] - # - # # TODO: I want to knwo what "Good" efficiency means for the description - # # 'Flat 28, 22 Adelina Grove' 'Solid brick, as built, insulated (assumed)' - # # so to do this, filter on the local authority code and property type, where we have U - # # values for the wall and take a median! 
- # - # p = input_properties[6] - # df = pd.DataFrame(data) - # - # res = [] - # for p in input_properties: - # distances = [] - # for borehole in tqdm(borehole_client.data, total=len(borehole_client.data)): - # dist_meeters, _ = borehole_client.distance_between_bng_coords( - # x1_bng=p.coordinates['x_coordinate'], - # y1_bng=p.coordinates['y_coordinate'], - # x2_bng=float(borehole['EASTING']), - # y2_bng=float(borehole['NORTHING']) - # ) - # distances.append(dist_meeters) - # - # res.append( - # { - # "uprn": int(p.data["uprn"]), - # "meters_to_nearest_borehole": min(distances) - # } - # - # ) - # res = pd.DataFrame(res) - # - # properties_dataset = [ - # { - # **p.data, - # "in_conservation_area": p.in_conservation_area, - # **p.coordinates, - # - # } for p in input_properties - # ] - # - # properties_dataset = pd.DataFrame(properties_dataset) - # properties_dataset = properties_dataset.merge(res, on="uprn", how="left") - # - # properties_dataset.to_csv("properties_dataset.csv") - - # We test estimating gain - import pandas as pd - pd.set_option('display.max_rows', 500) - pd.set_option('display.max_columns', 500) - pd.set_option('display.width', 1000) - df = pd.DataFrame(data) - - # We need to split the data into a train and test set for model build - - # If these categorical variables are not of type 'category', convert them - - print(results.summary()) - - grouped_error = [] - groupby = ["mainheat-description"] - for group, data in model_data.groupby(groupby, observed=True): - group_fit_error, _ = calculate_regression_metrics(y_true=data[response].astype(float), y_pred=data["fit"]) - # plot_regression(pd.DataFrame({"fit": data["fit"].values, "actual": data[response].astype(float).values})) - grouped_error.append( - { - **dict(zip(groupby, group)), - "n_samples": data.shape[0], - **group_fit_error, - } - ) - - grouped_error = pd.DataFrame(grouped_error) - grouped_error = grouped_error.sort_values("R2 Score", ascending=True) - - plot_regression(fit_df) - - 
model_data[["thermal_transmittance", response]].corr() - - summary = model_data.groupby(["property-type", "built-form"], observed=True)[ - ["thermal_transmittance", response] - ].corr() - - summary = ( - model_data - .groupby(component_features + base_features) - .agg({response: 'median', "idx": 'size'}) - .reset_index() - ) - - summary = summary.sort_values("walls-description") - - example = summary[ - (summary["walls-description"].isin( - [ - "Solid brick, as built, no insulation (assumed)", - "Solid brick, as built, partial insulation (assumed)", - "Solid brick, as built, insulated (assumed)", - ] - )) & - (summary["property-type"] == "House") & - (summary["built-form"] == "Detached") & - # (summary["construction-age-band"] == "England and Wales: 1976-1982") - (summary["number-habitable-rooms"] == "4") - ] - - from textblob import TextBlob - converter = TextBlob("excelent lighting in this hosehold") - - from model_data.utils import correct_spelling - result = correct_spelling("excelent lighting in this hosehold") - print(result) - 'excellent lighting in this household' - - def app(): """ For a pre-defined list of constituencies and property types, we'll download EPC data from the API @@ -425,4 +82,6 @@ def app(): uvalue_estimates.get_estimates(cleaner=cleaner) # TODO: Store these to a db - uvalue_estimates.floors_decile_data + sap_model = SapModel(data=data, cleaner=cleaner) + sap_model.run() + # TODO: Store outputs to db diff --git a/model_data/requirements/requirements.txt b/model_data/requirements/requirements.txt index 1bc54bc7..28fce331 100644 --- a/model_data/requirements/requirements.txt +++ b/model_data/requirements/requirements.txt @@ -18,4 +18,3 @@ statsmodels scikit-learn pyspellchecker textblob -xgboost diff --git a/model_data/requirements/static.txt b/model_data/requirements/static.txt index 95a6a6dd..55b449c1 100644 --- a/model_data/requirements/static.txt +++ b/model_data/requirements/static.txt @@ -1,3 +1,2 @@ -xgboost statsmodels scikit-learn 
diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index afdea35f..2cf90c0f 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -1,7 +1,7 @@ import math from typing import List from model_data.BaseUtility import BaseUtility -from model_data.Property import Property +from backend.Property import Property from model_data.rdsap_tables import default_wall_thickness, age_band_data from recommendations.recommendation_utils import ( r_value_per_mm_to_u_value, calculate_u_value_uplift, is_diminishing_returns, update_lowest_selected_u_value, diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index f729fc82..35a5b022 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -1,7 +1,7 @@ import itertools import math -from model_data.Property import Property +from backend.Property import Property from model_data.BaseUtility import BaseUtility from recommendations.recommendation_utils import ( r_value_per_mm_to_u_value, calculate_u_value_uplift, is_diminishing_returns, update_lowest_selected_u_value, diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 9d09a13a..ae906194 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -1,5 +1,5 @@ from copy import deepcopy -from model_data.Property import Property +from backend.Property import Property from statistics import mean diff --git a/recommendations/tests/test_data/input_properties.pkl b/recommendations/tests/test_data/input_properties.pkl index 09aa1dc9..d21b89c2 100644 Binary files a/recommendations/tests/test_data/input_properties.pkl and b/recommendations/tests/test_data/input_properties.pkl differ diff --git a/recommendations/tests/test_floor_recommendations.py b/recommendations/tests/test_floor_recommendations.py index 8e265acc..ac9c7380 100644 --- 
a/recommendations/tests/test_floor_recommendations.py +++ b/recommendations/tests/test_floor_recommendations.py @@ -5,6 +5,17 @@ from unittest.mock import Mock from recommendations.FloorRecommendations import FloorRecommendations +# with open( +# os.path.abspath(os.path.dirname(__file__)) + "/recommendations/tests/test_data/input_properties.pkl", "rb" +# ) as f: +# input_properties = pickle.load(f) +# +# with open( +# os.path.abspath(os.path.dirname(__file__)) + "/recommendations/tests/test_data/uvalue_estimates.pkl", "rb" +# ) as f: +# uvalue_estimates = pickle.load(f) + + class TestWallRecommendations: @pytest.fixture @@ -106,7 +117,7 @@ class TestWallRecommendations: recommender.recommend() assert not recommender.property.floor["is_suspended"] assert recommender.property.floor["is_solid"] - assert recommender.estimated_u_value == 0.7361642182695053 + assert recommender.estimated_u_value == 0.7528014214215474 assert recommender.recommendations types = {part["type"] for x in recommender.recommendations for part in x["parts"]} diff --git a/recommendations/tests/test_wall_recommendations.py b/recommendations/tests/test_wall_recommendations.py index 2aa751cb..c053de03 100644 --- a/recommendations/tests/test_wall_recommendations.py +++ b/recommendations/tests/test_wall_recommendations.py @@ -7,7 +7,7 @@ import numpy as np from unittest.mock import Mock, MagicMock from recommendations.WallRecommendations import WallRecommendations from model_data.analysis.UvalueEstimations import UvalueEstimations -from model_data.Property import Property +from backend.Property import Property from recommendations.recommendation_utils import is_diminishing_returns diff --git a/serverless.yml b/serverless.yml index 1da36d33..07ff3e97 100644 --- a/serverless.yml +++ b/serverless.yml @@ -28,21 +28,31 @@ provider: package: individually: true - # include: - # - backend/** - # # Might need to refine the paths that are included - # - model_data/** - exclude: - - model_data/local_data/** - - 
model_data/tests/** - - infrastructure/** - - data_collection/** - - node_modules/** - - conservation_areas/** - - open_uprn/** - - land_registry/** - - recommendations/tests/** - - pytest.ini + patterns: + - backend/** + - '!backend/tests/**' + - recommendations/** + - '!recommendations/tests/**' + - '!model_data/**' + - model_data/BaseUtility.py + - '!infrastructure/**' + - '!data_collection/**' + - '!node_modules/**' + - '!conservation_areas/**' + - '!open_uprn/**' + - '!land_registry/**' + - '!pytest.ini' + # exclude: + # - model_data/local_data/** + # - model_data/tests/** + # - infrastructure/** + # - data_collection/** + # - node_modules/** + # - conservation_areas/** + # - open_uprn/** + # - land_registry/** + # - recommendations/tests/** + # - pytest.ini plugins: - serverless-python-requirements