diff --git a/model_data/app.py b/model_data/app.py index e6761121..8d167993 100644 --- a/model_data/app.py +++ b/model_data/app.py @@ -1,12 +1,14 @@ from tqdm import tqdm import os +import pandas as pd from model_data.config import EPC_AUTH_TOKEN from epc_api.client import EpcClient from model_data.downloader import pagenated_epc_download from model_data.EpcClean import EpcClean from model_data.analysis.UvalueEstimations import UvalueEstimations -from model_data.analysis.SapModel import SapModel +from model_data.simulation_system.core.Settings import EARLIEST_EPC_DATE +from pathlib import Path LAND_REGISTRY_PATHS = [ os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-monthly-update-new-version.csv", @@ -19,6 +21,8 @@ LAND_REGISTRY_PATHS = [ os.path.abspath(os.path.dirname(__file__)) + "/model_data/local_data/pp-2017-part2.csv", ] +EPC_DIRECTORY = Path(__file__).parent / "model_data" / "simulation_system" / "data" / "all-domestic-certificates" + def app(): """ @@ -28,36 +32,36 @@ def app(): :return: """ - epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) - - constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'} - property_types = ["bungalow", "flat", "house", "maisonette", "park home"] - floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"] - - # We pull properties from local authorities, by property type. 
This will allow us to build - # a dataset of up to 10k properties per local authority/property type combination - # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were - # conducted after 2010, since SAP09 was introduced in 2009 an later SAP12 was introduced in England - # and Wales from 31 July 2014 - # Download data from August 2014 onwards - data = [] - for c in tqdm(constituencies): - for pt in property_types: - for fa in floor_areas: - data.extend( - pagenated_epc_download( - client=epc_client, - params={ - "constituency": c, - "property-type": pt, - "from-month": 8, - "from-year": 2014, - "floor-area": fa, - }, - page_size=5000, - n_pages=10, - ) - ) + # epc_client = EpcClient(auth_token=EPC_AUTH_TOKEN) + # + # constituencies = {'E14000555', 'E14000726', 'E14000720', 'E14000721', 'E14000553', 'E14000752'} + # property_types = ["bungalow", "flat", "house", "maisonette", "park home"] + # floor_areas = ["unknown", "s", "m", "l", "xl", "xxl", "xxxl"] + # + # # We pull properties from local authorities, by property type. 
This will allow us to build + # # a dataset of up to 10k properties per local authority/property type combination + # # For particularly old EPC data, we have inconsistent records so we'll only include EPCS that were + # # conducted after 2010, since SAP09 was introduced in 2009 and later SAP12 was introduced in England + # # and Wales from 31 July 2014 + # # Download data from August 2014 onwards + # data = [] + # for c in tqdm(constituencies): + # for pt in property_types: + # for fa in floor_areas: + # data.extend( + # pagenated_epc_download( + # client=epc_client, + # params={ + # "constituency": c, + # "property-type": pt, + # "from-month": 8, + # "from-year": 2014, + # "floor-area": fa, + # }, + # page_size=5000, + # n_pages=10, + # ) + # ) # Production of sample data for land registry # address_meta = [ @@ -75,20 +79,32 @@ def app(): # with open("sample_addresses.pkl", "wb") as f: # pickle.dump(address_meta, f) - # Incorporate input data into cleaning - cleaner = EpcClean(data) - lighting_averages = cleaner.lighting_averages - # TODO: WE need to store lighting_averages to a db - # We should also extend these averages so they're by more variables (property type, age band, constituency, - # etc) - cleaner.clean() - # TODO: cleaner.cleaned datasets to a db + epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] + for directory in epc_directories: + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + # Rename the columns to the same format as the api returns + data.columns = [c.replace("_", "-").lower() for c in data.columns] + # Keep only the records lodged on or after the date threshold + data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] - # TODO: Add property age band into this - uvalue_estimates = UvalueEstimations(data=data) - uvalue_estimates.get_estimates(cleaner=cleaner) - # TODO: Store these to a db + # Convert to list of dictionaries as returned by the api + data = data.to_dict("records") - sap_model = 
SapModel(data=data, cleaner=cleaner) - sap_model.run() - # TODO: Store outputs to db + # Incorporate input data into cleaning + cleaner = EpcClean(data) + lighting_averages = cleaner.lighting_averages + # + # TODO: All of these outputs can be stored by constituency so we can reduce the amount + # of data we fetch + # + # TODO: WE need to store lighting_averages to a s3 + # We should also extend these averages so they're by more variables (property type, age band, + # constituency, + # etc) + cleaner.clean() + # TODO: cleaner.cleaned datasets to s3 + + # TODO: Add property age band into this + uvalue_estimates = UvalueEstimations(data=data) + uvalue_estimates.get_estimates(cleaner=cleaner) + # TODO: Store these to a s3 diff --git a/model_data/simulation_system/core/Settings.py b/model_data/simulation_system/core/Settings.py index 030747ee..01d4151e 100644 --- a/model_data/simulation_system/core/Settings.py +++ b/model_data/simulation_system/core/Settings.py @@ -53,6 +53,11 @@ DEPLOYMENT_FOLDER = "deployment" TOTAL_FLOOR_AREA_NATIONAL_AVERAGE = 70 FLOOR_HEIGHT_NATIONAL_AVERAGE = 2.45 +AVERAGE_FIXED_FEATURES = [ + "TOTAL_FLOOR_AREA", + "FLOOR_HEIGHT" +] + COLUMNS_TO_MERGE_ON = [ "PROPERTY_TYPE", "BUILT_FORM", diff --git a/model_data/simulation_system/generate_rdsap_change.py b/model_data/simulation_system/generate_rdsap_change.py index 53107df0..ec895408 100644 --- a/model_data/simulation_system/generate_rdsap_change.py +++ b/model_data/simulation_system/generate_rdsap_change.py @@ -9,6 +9,7 @@ from simulation_system.core.Settings import ( RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, COLUMNS_TO_MERGE_ON, + EARLIEST_EPC_DATE ) from simulation_system.core.DataProcessor import DataProcessor from utils import save_dataframe_to_s3_parquet @@ -69,9 +70,6 @@ def app(): property_data[MANDATORY_FIXED_FEATURES].iloc[-1].to_dict() ) - # Taking just the last row, which is the percentage change from the latest to previous one only - # 
property_data[AVERAGE_FIXED_FEATURES].fillna(value=0).pct_change().iloc[-1] > 0.1 - # Extract the columns that are not all None modified_property_data = DataProcessor.apply_averages_cleaning( data_to_clean=property_data, @@ -143,9 +141,12 @@ def app(): data_by_urpn_df = pd.DataFrame(data_by_urpn) # Add some temporal features - we look at the days from the standard starting point in time # for the starting and ending date so all records are from a fixed point - # TODO: implement me - data_by_urpn_df["DAYS_TO_STARTING"] = None - data_by_urpn_df["DAYS_TO_ENDING"] = None + data_by_urpn_df["DAYS_TO_STARTING"] = ( + pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_STARTING"]) - pd.to_datetime(EARLIEST_EPC_DATE) + ).dt.days + data_by_urpn_df["DAYS_TO_ENDING"] = ( + pd.to_datetime(data_by_urpn_df["LODGEMENT_DATE_ENDING"]) - pd.to_datetime(EARLIEST_EPC_DATE) + ).dt.days # TODO: We need to pre-process the data. For instance, rather than using static for roofs, walls and # floors, we may want to use the U-value. We may also want to handle the (assumed) tags