import msgpack import pandas as pd from datetime import datetime from typing import List from pathlib import Path from tqdm import tqdm import multiprocessing as mp from etl.epc.DataProcessor import EPCDataProcessor from etl.epc.Record import EPCRecord, EPCDifferenceRecord from etl.epc.Dataset import TrainingDataset from utils.s3 import save_dataframe_to_s3_parquet, read_from_s3 from etl.epc.settings import ( MANDATORY_FIXED_FEATURES, LATEST_FIELD, COMPONENT_FEATURES, RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE, CORE_COMPONENT_FEATURES, EFFICIENCY_FEATURES, POTENTIAL_COLUMNS, ROOM_FEATURES, COST_FEATURES, POST_SAP10_FEATURE, ) # TODO: change in setting file MANDATORY_FIXED_FEATURES = [x.lower() for x in MANDATORY_FIXED_FEATURES] # LATEST_FIELD = [x.lower() for x in LATEST_FIELD if x.lower() not in ROOM_FEATURES] LATEST_FIELD = [x.lower() for x in LATEST_FIELD] COMPONENT_FEATURES = [x.lower() for x in COMPONENT_FEATURES] RDSAP_RESPONSE = RDSAP_RESPONSE.lower() HEAT_DEMAND_RESPONSE = HEAT_DEMAND_RESPONSE.lower() CARBON_RESPONSE = CARBON_RESPONSE.lower() CORE_COMPONENT_FEATURES = [x.lower() for x in CORE_COMPONENT_FEATURES] EFFICIENCY_FEATURES = [x.lower() for x in EFFICIENCY_FEATURES] POTENTIAL_COLUMNS = [x.lower() for x in POTENTIAL_COLUMNS] VARIABLE_DATA_FEATURES = ( COMPONENT_FEATURES + ROOM_FEATURES + EFFICIENCY_FEATURES + POTENTIAL_COLUMNS + ["lodgement_date", RDSAP_RESPONSE, HEAT_DEMAND_RESPONSE, CARBON_RESPONSE] ) COST_FEATURES = [x.lower() for x in COST_FEATURES] def get_cleaned_description_mapping(): """ This function will retrieve the cleaned dataset from s3 which has the cleaned descriptions for the epc dataset This data is stored in MessagePack format and therefore needs to be decoded :return: """ cleaned = read_from_s3( s3_file_name="cleaned_epc_data/cleaned.bson", bucket_name="retrofit-data-dev" ) cleaned = msgpack.unpackb(cleaned, raw=False) return cleaned clean_lookup = get_cleaned_description_mapping() # TODO: THIS IS A TEMPORARY FIX new_walls_description_mapping = pd.DataFrame(clean_lookup["walls-description"]) new_walls_description_mapping.loc[ ~new_walls_description_mapping["thermal_transmittance_unit"].isnull(), "thermal_transmittance_unit", ] = "w/m-¦k" clean_lookup["walls-description"] = new_walls_description_mapping.to_dict( orient="records" ) # TODO: THIS IS A TEMPORARY FIX new_mainheatcont_mapping = pd.DataFrame(clean_lookup["mainheatcont-description"]) new_mainheatcont_mapping.loc[ new_mainheatcont_mapping["original_description"] == "SAP:Main-Heating-Controls", [ "thermostatic_control", "charging_system", "switch_system", "no_control", "dhw_control", "community_heating", "multiple_room_thermostats", "auxiliary_systems", "trvs", "rate_control", ], ] = None new_mainheatcont_mapping.loc[ new_mainheatcont_mapping["original_description"] == " ", [ "thermostatic_control", "charging_system", "switch_system", "no_control", "dhw_control", "community_heating", "multiple_room_thermostats", "auxiliary_systems", "trvs", "rate_control", ], ] = None clean_lookup["mainheatcont-description"] = new_mainheatcont_mapping.to_dict( orient="records" ) # TEMP FIX - GRANITE OR WHINSTONE BOOLEAN ISSUE new_walls_description_mapping = pd.DataFrame(clean_lookup["walls-description"]) new_walls_description_mapping.loc[ new_walls_description_mapping["original_description"].str.contains("Granite"), "is_granite_or_whinstone", ] = True clean_lookup["walls-description"] = new_walls_description_mapping.to_dict( orient="records" ) class EPCPipeline: """ This class will take a list of directories and process them to create a dataset: - Load the data - Pre-process the data - Create a dataset - Clean the dataset - Store the dataset """ def __init__( self, epc_data_processor: EPCDataProcessor, api_epc_records: dict = None, directories: List[Path] | None = None, run_mode="training", epc_local_file="certificates.csv", epc_bucket_name="retrofit-data-dev", epc_cleaning_dataset_key="sap_change_model/{}/cleaning_dataset_rooms.parquet", epc_all_equal_rows_key="sap_change_model/{}/all_equal_rows_rooms.parquet", epc_compiled_dataset_key="sap_change_model/{}/dataset_rooms.parquet", use_parallel=False, ): """ :param directories: List of directories to process :param epc_data_processor: EPCDataProcessor object :param run_mode: Either training or newdata :param epc_local_file: Local file name of the EPC data :param epc_bucket_name: S3 bucket name :param epc_cleaning_dataset_key: S3 key for the cleaning dataset :param epc_all_equal_rows_key: S3 key for the all equal rows dataset :param epc_compiled_dataset_key: S3 key for the compiled dataset """ self.compiled_dataset: pd.DataFrame = pd.DataFrame() self.compiled_all_equal_rows: list = [] self.compiled_cleaning_averages: list = [] self.directories = directories self.epc_data_processor = epc_data_processor self.api_epc_records = api_epc_records self.run_mode = run_mode self.epc_local_file = epc_local_file self.epc_bucket_name = epc_bucket_name self.use_parallel = use_parallel self.timeprefix = datetime.now().strftime("%Y-%m-%d-%H-%M-%S") self.epc_cleaning_dataset_key = epc_cleaning_dataset_key.format(self.timeprefix) self.epc_all_equal_rows_key = epc_all_equal_rows_key.format(self.timeprefix) self.epc_compiled_dataset_key = epc_compiled_dataset_key.format(self.timeprefix) def run(self): """ Entrypoint to run the pipeline """ if self.run_mode == "training": self.run_training_dataset_pipeline() elif self.run_mode == "newdata": self.run_newdata_dataset_pipeline() else: raise ValueError("Run mode defined needs to be in 'training' or 'newdata'") def run_newdata_dataset_pipeline(self): """ Main function to run the newdata pipeline """ prepared_epc = EPCRecord( self.api_epc_records, run_mode="newdata" ) # This uses all the epc records to clean the data self.epc_data_processor.insert_data(prepared_epc) self.epc_data_processor.prepare_data() data = self.epc_data_processor.data epc_records = [ EPCRecord(**x, run_mode="newdata") for x in data.to_dict(orient="records") ] def run_training_dataset_pipeline(self): """ Main function to run the training dataset generation pipeline """ if self.directories is None: raise ValueError( "Directories not specified - Unable to run Training pipeline" ) if self.use_parallel: self.run_training_dataset_parallel_pipeline() else: for directory in tqdm(self.directories): self.process_directory(directory) save_dataframe_to_s3_parquet( df=self.compiled_dataset, bucket_name=self.epc_bucket_name, file_key=self.epc_compiled_dataset_key, ) save_dataframe_to_s3_parquet( df=pd.DataFrame(self.compiled_all_equal_rows), bucket_name=self.epc_bucket_name, file_key=self.epc_all_equal_rows_key, ) save_dataframe_to_s3_parquet( df=pd.concat(self.compiled_cleaning_averages), bucket_name=self.epc_bucket_name, file_key=self.epc_cleaning_dataset_key, ) def run_training_dataset_parallel_pipeline(self): """ Run the training pipeline in parallel """ with mp.Pool() as pool: results = list( tqdm( pool.imap(self.process_directory_task, self.directories), total=len(self.directories), ), ) for result in tqdm(results): self.compiled_dataset = pd.concat( [self.compiled_dataset, result["dataset"]] ) self.compiled_cleaning_averages.append(result["cleaning_averages"]) self.compiled_all_equal_rows.extend(result["all_equal_rows"]) def process_directory_task(self, directory: str) -> pd.DataFrame: """ Task to enable parallel processing """ self.process_directory(directory=directory) output = { "dataset": self.compiled_dataset, "cleaning_averages": self.epc_data_processor.cleaning_averages, "all_equal_rows": self.compiled_all_equal_rows, } return output def process_directory(self, directory: Path): """ Process a single directory :param directory: :return: """ filepath = directory / self.epc_local_file self.epc_data_processor.prepare_data(filepath=filepath) constituency_data = self.epc_data_processor.data self.compiled_cleaning_averages.append( self.epc_data_processor.cleaning_averages ) constituency_difference_records = [] for uprn, property_data in constituency_data.groupby("uprn", observed=True): difference_records = self.process_uprn( uprn=str(uprn), property_data=property_data, directory=directory ) if difference_records is not None: constituency_difference_records.extend(difference_records) constituency_dataset = TrainingDataset( datasets=constituency_difference_records, cleaned_lookup=clean_lookup ) self.compiled_dataset = pd.concat( [self.compiled_dataset, constituency_dataset.df] ) def process_uprn(self, uprn: str, property_data: pd.DataFrame, directory: Path): """ Process a single UPRN, which may have multiple different EPCs :param uprn: UPRN :param property_data: pd.DataFrame, Data for a single UPRN :param directory: Path, Directory of the UPRN :return: """ # If a property has changed building type, we can ignore the epc rating i.e. this should be 1 unique row if any(property_data[MANDATORY_FIXED_FEATURES].nunique() > 1) or ( pd.isnull(property_data[MANDATORY_FIXED_FEATURES]).sum().sum() > 0 ): return None # Fixed features - these are property attributes that shouldn't change over time # Take the latest row for both the LATEST_FEILDS and MANDATORY FIELDS and combine all fields together fixed_data = ( property_data[MANDATORY_FIXED_FEATURES + LATEST_FIELD].iloc[-1].to_dict() ) # We include the lodgement date here as we probably need to factor time into the # model, since EPC standards and rigour have changed over time variable_data = property_data[ VARIABLE_DATA_FEATURES + COST_FEATURES + POST_SAP10_FEATURE ] uprn = str(uprn) epc_records = [ EPCRecord(uprn, **x, run_mode="training") for x in variable_data.to_dict(orient="records") ] # TODO: We want to be able to provide value for the u values in the main pipeline so this will need to be # part of the EPCRecord # We can use multiple types of comparison datasets - i.e. Compare consecutive records, or compare all # permutations of records property_difference_records = self._generate_property_difference_records( epc_records, uprn, directory, fixed_data ) return property_difference_records def _generate_property_difference_records( self, epc_records: List[EPCRecord], uprn: str, directory: Path, fixed_data: dict ): """ We can use multiple types of comparison datasets, for example: - First vs second - Second vs third - First vs third :param epc_records: :return: """ property_difference_records: list = [] # property_difference_records = self._compare_consecutive_epcs(epc_records, uprn, directory, fixed_data, # property_difference_records) property_difference_records = self._compare_all_permutation_epcs( epc_records, uprn, directory, fixed_data, property_difference_records ) return property_difference_records def _compare_all_permutation_epcs( self, epc_records: List[EPCRecord], uprn: str, directory: Path, fixed_data: dict, property_difference_records: list, ): """ Compare all permutations of EPCs for a given UPRN :param epc_records: :return: """ for idx in range(0, len(epc_records) - 1): for idx2 in range(idx + 1, len(epc_records)): earliest_record: EPCRecord = epc_records[idx] latest_record: EPCRecord = epc_records[idx2] # Auto sort the records so that the record with highest RDSAP score is always record1 difference_record: EPCDifferenceRecord = ( latest_record.create_epc_difference_record(other=earliest_record, fixed_data=fixed_data) ) # difference_record: EPCDifferenceRecord = latest_record - earliest_record # # TODO: Use method above instead of overloading operator # difference_record.append_fixed_data(fixed_data) # TODO: Pull out RDSAP_CHANGE to a variable if difference_record.get("rdsap_change") == 0: if not difference_record.ensure_adequate_data(): # Rdsap hasn't changed but we have enough data to use this record # i.e. all fields aside from mechnical ventilation are the same] # self.check_records.append({"uprn": uprn, "directory_name": directory.name, # "difference_record": difference_record, "earliest_record": earliest_record, # "latest_record": latest_record}) continue all_equal = difference_record.compare_fields_in_records( fields=[x.lower() for x in CORE_COMPONENT_FEATURES] ) if all_equal: # Keep track of this for the moment so we can analyse self.compiled_all_equal_rows.append( {"uprn": uprn, "directory_name": directory.name} ) continue property_difference_records.append(difference_record) return property_difference_records def _compare_consecutive_epcs( self, epc_records: List[EPCRecord], uprn: str, directory: Path, fixed_data: dict, property_difference_records: list, ): """ Compare consecutive EPCs for a given UPRN :param epc_records: :return: """ for idx in range(0, len(epc_records) - 1): if idx >= len(epc_records) - 1: break earliest_record: EPCRecord = epc_records[idx] latest_record: EPCRecord = epc_records[idx + 1] # Auto sort the records so that the record with highest RDSAP score is always record1 difference_record: EPCDifferenceRecord = latest_record - earliest_record # TODO: Use method above instead of overloading operator difference_record.append_fixed_data(fixed_data) # TODO: Pull out RDSAP_CHANGE to a variable if difference_record.get("rdsap_change") == 0: if not difference_record.ensure_adequate_data(): # Rdsap hasn't changed but we have enough data to use this record # i.e. all fields aside from mechnical ventilation are the same] # self.check_records.append({"uprn": uprn, "directory_name": directory.name, "difference_record": # difference_record, "earliest_record": earliest_record, "latest_record": latest_record}) continue all_equal = difference_record.compare_fields_in_records( fields=[x.lower() for x in CORE_COMPONENT_FEATURES] ) if all_equal: # Keep track of this for the moment so we can analyse self.compiled_all_equal_rows.append( {"uprn": uprn, "directory_name": directory.name} ) continue # difference_record.append_fixed_data(fixed_data) property_difference_records.append(difference_record) return property_difference_records