diff --git a/etl/epc/DataProcessor.py b/etl/epc/DataProcessor.py
index 0587fdbe..8adac8df 100644
--- a/etl/epc/DataProcessor.py
+++ b/etl/epc/DataProcessor.py
@@ -5,6 +5,9 @@ from BaseUtility import Definitions
 from etl.epc.settings import (
     DATA_PROCESSOR_SETTINGS,
     EARLIEST_EPC_DATE,
+    IGNORED_TRANSACTION_TYPES,
+    IGNORED_FLOOR_LEVELS,
+    IGNORED_PROPERTY_TYPES,
     FULLY_GLAZED_DESCRIPTIONS,
     AVERAGE_FIXED_FEATURES,
     BUILT_FORM_REMAP,
@@ -416,9 +419,11 @@ class DataProcessor:
 
         self.data = self.data[~pd.isnull(self.data["UPRN"])]
         self.data = self.data[self.data["LODGEMENT_DATE"] >= EARLIEST_EPC_DATE]
-        self.data = self.data[self.data["TRANSACTION_TYPE"] != "new dwelling"]
+        self.data = self.data[
+            ~self.data["TRANSACTION_TYPE"].isin(IGNORED_TRANSACTION_TYPES)
+        ]
         self.data = self.data[
-            ~self.data["FLOOR_LEVEL"].isin(["top floor", "mid floor"])
+            ~self.data["FLOOR_LEVEL"].isin(IGNORED_FLOOR_LEVELS)
         ]
         self.data = self.data[self.data[RDSAP_RESPONSE] <= MAX_SAP_SCORE]
 
@@ -430,7 +435,7 @@ class DataProcessor:
         # Because park homes are surveyed unusually (for example, we don't have u-values to
         # look up for their different components, they need to be collected in survey and aren't reflected in
         # EPCs) we'll ignore them from the model
-        self.data = self.data[self.data["PROPERTY_TYPE"] != "Park home"]
+        self.data = self.data[~self.data["PROPERTY_TYPE"].isin(IGNORED_PROPERTY_TYPES)]
 
     def clean_multi_glaze_proportion(self) -> None:
         """
diff --git a/etl/epc/property_change_app.py b/etl/epc/property_change_app.py
index 4f49f6da..18228cb2 100644
--- a/etl/epc/property_change_app.py
+++ b/etl/epc/property_change_app.py
@@ -28,7 +28,7 @@ from recommendations.recommendation_utils import (
 DATA_DIRECTORY = Path(__file__).parent / "local_data" / "all-domestic-certificates"
 
 
-def get_cleaned():
+def get_cleaned_description_mapping():
     """
     This function will retrieve the cleaned dataset from s3 which has the cleaned
     descriptions for the epc dataset
@@ -404,7 +404,7 @@ def app():
     # Data glossary:
     # https://epc.opendatacommunities.org/docs/guidance#glossary
 
-    cleaned_lookup = get_cleaned()
+    cleaned_lookup = get_cleaned_description_mapping()
 
     # List all subdirectories
     directories = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
diff --git a/etl/epc/requirements.txt b/etl/epc/requirements.txt
index e69de29b..9f972bde 100644
--- a/etl/epc/requirements.txt
+++ b/etl/epc/requirements.txt
@@ -0,0 +1,4 @@
+pandas==2.1.3
+tqdm==4.66.1
+msgpack==1.0.7
+boto3==1.29.6
diff --git a/etl/epc/settings.py b/etl/epc/settings.py
index 60c079a5..eb8eb641 100644
--- a/etl/epc/settings.py
+++ b/etl/epc/settings.py
@@ -155,6 +155,12 @@ MANDATORY_FIXED_FEATURES = ["PROPERTY_TYPE", "BUILT_FORM", "CONSTITUENCY"]
 # and Wales from 31 July 2014
 EARLIEST_EPC_DATE = "2014-08-01"
 
+# Rows whose column value appears in one of these lists are dropped by DataProcessor.
+# Each is a list (even with a single entry) so it can be passed to Series.isin.
+IGNORED_TRANSACTION_TYPES = ["new dwelling"]
+IGNORED_FLOOR_LEVELS = ["top floor", "mid floor"]
+IGNORED_PROPERTY_TYPES = ["Park home"]
+
 RDSAP_RESPONSE = "CURRENT_ENERGY_EFFICIENCY"
 HEAT_DEMAND_RESPONSE = "ENERGY_CONSUMPTION_CURRENT"
 CARBON_RESPONSE = "CO2_EMISSIONS_CURRENT"