import hashlib import os import re import tiktoken from pprint import pprint from datetime import datetime from openai import OpenAI import numpy as np import pandas as pd from fuzzywuzzy import process from utils.logger import setup_logger from backend.SearchEpc import SearchEpc from BaseUtility import Definitions import asset_list.mappings.property_type as property_type_mappings import asset_list.mappings.walls as walls_mappings import asset_list.mappings.heating_systems as heating_mappings import asset_list.mappings.exising_pv as existing_pv_mappings from recommendations.recommendation_utils import ( estimate_perimeter, estimate_external_wall_area, estimate_number_of_floors ) from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes logger = setup_logger() # OpenAI API Key (set this in your environment variables for security) OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") class DataRemapper: def __init__(self, standard_values, standard_map=None, max_tokens=1000): """ Initialize the remapper with standard values and a predefined mapping. :param standard_values: Set of allowed standardized values. :param standard_map: Dictionary of common remappings {raw_value: standard_value}. """ self.standard_values = standard_values self.standard_map = standard_map self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing # Tokenizer for counting tokens self.tokenizer = tiktoken.encoding_for_model(self.ai_model) # Track token usage and remap dictionary self.total_tokens_used = 0 self.total_cost = 0 self.remap_dict = {} # {original_value: standardized_value} self.max_tokens = max_tokens # Limit for OpenAI API # Memoization for AI calls self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} # Capture the reponse for debugging self.ai_response = None # OpenAI pricing (as of Feb 2024) self.pricing = { "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000}, "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, } self.openai_client = OpenAI(api_key=OPENAI_API_KEY) @staticmethod def clean_string(text): """Basic text cleaning: remove extra spaces, punctuation, and normalize case.""" if not isinstance(text, str): return None text = text.strip().lower() text = re.sub(r'[^\w\s]', '', text) # Remove punctuation # Replace double strings text = re.sub(r'\s+', ' ', text) return text def fuzzy_match(self, text): """Use fuzzy matching to find the closest standard value.""" match, score = process.extractOne(text, self.standard_values) if text else (None, 0) return match if score >= self.fuzzy_threshold else None def count_tokens(self, text): """Estimate the number of tokens in a given text.""" return len(self.tokenizer.encode(text)) if text else 0 def ai_standardize(self, unmapped_values): """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.""" if not unmapped_values: return {} unmapped_tuple = tuple(sorted(unmapped_values)) # Ensure consistency for memoization if unmapped_tuple in self.ai_cache: return self.ai_cache[unmapped_tuple] # Return memoized result prompt = f""" You are an expert in data classification. Standardize each of these values into one of the categories: {list(self.standard_values)}. Return only a JSON dictionary where: - The keys are the original values. - The values are the standardized ones. Strictly return JSON **without markdown formatting** or extra text. Example Output: {{ "BLKHOUS": "block house", "BEDSIT": "bedsit" }} Values to standardize: {unmapped_values} """ # Count input tokens input_tokens = self.count_tokens(prompt) if input_tokens > self.max_tokens: raise ValueError("Input tokens exceed the maximum limit.") logger.info("Calling OpenAI API for standardization...") response = self.openai_client.chat.completions.create( model=self.ai_model, messages=[{"role": "user", "content": prompt}], max_tokens=self.max_tokens, temperature=0.1, ) output_text = response.choices[0].message.content.strip() output_tokens = self.count_tokens(output_text) # Count output tokens # Track total token usage self.total_tokens_used += input_tokens + output_tokens # Estimate cost input_cost = input_tokens * self.pricing[self.ai_model]["input"] output_cost = output_tokens * self.pricing[self.ai_model]["output"] self.total_cost += input_cost + output_cost try: # Parse response as dictionary mapping = eval(output_text) # OpenAI should return a valid dictionary except: mapping = {val: "unknown" for val in unmapped_values} # Fallback # Memoize the AI response self.ai_cache[unmapped_tuple] = mapping # We store the raw AI response for debugging logger.debug(f"AI Response: {mapping}") self.ai_response = output_text return mapping def standardize_list(self, values_to_remap): """ Standardizes a list of values and returns a dictionary {original_value: standardized_value}. :param values_to_remap: List of raw values to standardize. :return: Dictionary {original_value: standardized_value}. """ unique_values = set(values_to_remap) # Process only unique values unmapped_values = [] for value in unique_values: if pd.isna(value): # Handle NaN values self.remap_dict[value] = "unknown" continue cleaned_value = self.clean_string(value) # Rule-Based Check (Predefined Mapping) if cleaned_value in self.standard_map or value in self.standard_map: self.remap_dict[value] = ( self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] ) continue if value.lower() in self.standard_map: self.remap_dict[value] = self.standard_map[value.lower()] continue # Exact Match in Standard Values if cleaned_value in self.standard_values: self.remap_dict[value] = cleaned_value continue # Fuzzy Matching fuzzy_match = self.fuzzy_match(cleaned_value) if fuzzy_match: self.remap_dict[value] = fuzzy_match continue # Capture anything that wasn't mapped unmapped_values.append(value) # AI Model - remap anything unmapped (batch request) ai_mapping = self.ai_standardize(unmapped_values) self.remap_dict.update(ai_mapping) return self.remap_dict def report_usage(self): """Prints a summary of token usage and cost.""" print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}") print(f"💰 Estimated Cost: ${self.total_cost:.4f}") class AssetList: """ This class is used to standardise asset lists so that we can process the core information in a consistent manner. """ EPC_API_DATA_NAMES = { "uprn": "epc_os_uprn", "address1": "epc_address1", "address": "epc_address", "postcode": "epc_postcode", "inspection-date": "epc_inspection_date", "current-energy-efficiency": "epc_sap_score_on_register", "current-energy-rating": "epc_rating_on_register", "property-type": "epc_property_type", "built-form": "epc_archetype", "total-floor-area": "epc_total_floor_area", "construction-age-band": "epc_age_band", "floor-height": "epc_floor_height", "number-habitable-rooms": "epc_number_habitable_rooms", "walls-description": "epc_wall_construction", "roof-description": "epc_roof_construction", "floor-description": "epc_floor_construction", "mainheat-description": "epc_heating_type", 'mainheatcont-description': "epc_heating_controls", "secondheat-description": "epc_secondary_heating", "transaction-type": "epc_reason", "energy-consumption-current": "epc_heat_demand", "photo-supply": "epc_photo_supply" } FIND_EPC_DATA_NAMES = { "heating_text": "epc_estiamted_heating_kwh", "hot_water_text": "epc_estimated_hotwater_kwh", 'Assessor’s name': "epc_assessor_name", "Assessor's Telephone": "epc_assessor_telephone", "Assessor's Email": "epc_assessor_email", "Accreditation scheme": "epc_assessor_accreditation", "Assessor’s ID": "epc_assessor_id", "Solar photovoltaics": "epc_solar_pv" } DATETIME_REMAP = { "Pre 1900": datetime(year=1899, month=12, day=31), } # These are the accepted methods we have for cleaning the address1 column ADDRESS_1_CLEANING_METHODS = [ "first_two_words", # This method will split on the fist two words, where the separator is a space "first_word", # This method will split on the first word, where the separator is a space "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber # "address1_extraction" # This method will use the NLP model to extract address1 ] # Standard column Names STANDARD_ADDRESS_1 = "domna_address_1" STANDARD_POSTCODE = "domna_postcode" STANDARD_FULL_ADDRESS = "domna_full_address" STANDARD_YEAR_BUILT = "landlord_year_built" STANDARD_UPRN = "ordnance_survey_uprn" STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" STANDARD_PROPERTY_TYPE = "landlord_property_type" STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" STANDARD_HEATING_SYSTEM = "landlord_heating_system" STANDARD_EXISTING_PV = "landlord_existing_pv" DOMNA_PROPERTY_ID = "domna_property_id" # Regular expression for identifying if the address might point to multiple units MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') # List of columns relating to the non-intrusive data NON_INTRUSIVES_COLNAMES = [ "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required", "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION", "Any further surveyor notes", 'Surveyors Name' ] # This SAP threshold is a key search criteria for properties that may be eligible for extraction FILLED_CAVITY_SAP_THRESHOLD = 75 # This SAP the EMPTY_CAVITY_SAP_THRESHOLD = 71 # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 # Attributes - these are columns that we produce, calcualted based on other pieces of data ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"EPC is pre {EPC_YEAR_THRESHOLD}" # These are the descriptions that we look for in the EPC data that are indicative of no insulation EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ "cavity wall, as built, no insulation (assumed)", "cavity wall, as built, partial insulation (assumed)", "cavity wall, as built, partial insulation", "cavity wall, as built, no insulation", ] # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated EPC_INSULATED_WALLS_SUBSTRINGS = [ ", insulated", "with external insulation", "with internal insulation", "filled cavity" ] # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated EPC_INSULATED_ROOF_SUBSTRINGS = [ "(another dwelling above)", ", insulated", ", insulated (assumed) ", ", ceiling insulated", ] def __init__( self, local_filepath, sheet_name, address1_colname, postcode_colname, full_address_colname, landlord_property_id=None, full_address_cols_to_concat=None, missing_postcodes_method=None, address1_extraction_method=None, landlord_year_built=None, landlord_uprn=None, landlord_property_type=None, landlord_wall_construction=None, landlord_heating_system=None, landlord_existing_pv=None, header=0 ): self.local_filepath = local_filepath self.sheet_name = sheet_name # Read in the data self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) self.standardised_asset_list = self.raw_asset_list.copy() # We detect the presence of the non-intrusive columns self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False # Names of columns self.landlord_property_id = landlord_property_id self.address1_colname = address1_colname self.postcode_colname = postcode_colname self.full_address_colname = full_address_colname self.landlord_year_built = landlord_year_built self.landlord_uprn = landlord_uprn self.landlord_property_type = landlord_property_type self.landlord_wall_construction = landlord_wall_construction self.landlord_heating_system = landlord_heating_system self.landlord_existing_pv = landlord_existing_pv # parameters for cleaning self.full_address_cols_to_concat = full_address_cols_to_concat self.missing_postcodes_method = missing_postcodes_method self.address1_extraction_method = address1_extraction_method self.debug_information = { "property_type": None, "wall_construction": None, "heating_system": None, "existing_pv": None } self.variable_mappings = {} self.rename_map = {} self.keep_variables = [] # Finally, we handle the case where the landlord's property ID is actually the OS UPRN if self.landlord_uprn == self.landlord_property_id: self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() # Update the reference to landlord UPRn self.landlord_uprn = self.STANDARD_UPRN def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): if method not in self.ADDRESS_1_CLEANING_METHODS: raise ValueError(f"Method {method} for producing address1 not recognized") if method == "first_two_words": asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") return asset_list if method == "first_word": asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0] return asset_list if method == "house_number_extraction": asset_list[self.address1_colname] = asset_list.apply( lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), axis=1 ) return asset_list raise ValueError(f"Method {method} not recognized") @staticmethod def _address1_extraction(x): pass def create_property_id(self): """ This function creates the domna property ID, which is simply a hash of the full address and postcode We want all figures to be positive :return: """ # We'll remove punctuation and whitespace from the address, before hashing to produce an ID def _make_hash(value): """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value.""" # Normalize and remove special characters for cleaner ID cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower() # Generate SHA-256 hash and truncate it short_hash = hashlib.sha256(value.encode()).hexdigest()[:12] return f"{cleaned_value}-{short_hash}" # Apply transformation self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( self.standardised_asset_list[self.full_address_colname] + self.standardised_asset_list[self.postcode_colname] ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash) @staticmethod def _strip_postcode_from_full_address(full_address, postcode): cleaned = full_address.replace(postcode, "") # Remove any trailing commas and spaces cleaned = cleaned.rstrip(", ").strip(",").strip() return cleaned @classmethod def _identify_multi_address(cls, address): # We check if the address is comma separated if "," in address: address1_section = address.split(",")[0] # We look for string in the form (x-y) return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) @staticmethod def _convert_uprn(x): """ Used to convert UPRNS to integer strings :param x: uprn to convert :return: converted uprn """ if pd.isnull(x): return x # check if numeric if np.isreal(x): return str(int(x)) if str(x).isdigit(): return str(int(x)) return x def init_standardise(self): """ This function is used to standardise the asset list :return: standardised asset list """ # Remove rows without a postcode if self.postcode_colname is not None: self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) # We clean up portential non-breaking spaces, and double spaces for col in [ c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if c is not None ]: self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str) self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False) self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False) if self.address1_colname is None: if self.address1_extraction_method is None: raise ValueError("Missing address 1 - please specify an extraction method") self.address1_colname = self.STANDARD_ADDRESS_1 # If we do not have this, we produce it self.standardised_asset_list = self._extract_address1( asset_list=self.standardised_asset_list, full_address_col=self.full_address_colname, postcode_col=self.postcode_colname, method=self.address1_extraction_method ) if self.full_address_colname is None: if not self.full_address_cols_to_concat: raise ValueError("Missing full address - please specify columns to concatenate") self.full_address_colname = self.STANDARD_FULL_ADDRESS self.standardised_asset_list[self.full_address_colname] = ( self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) ) else: # Make sure to strip the postcode out of the full address self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply( lambda x: self._strip_postcode_from_full_address( full_address=x[self.full_address_colname], postcode=x[self.postcode_colname] ), axis=1 ) # We create the domna property id self.create_property_id() # Clean up the UPRN column, if the landlord has provided them if self.landlord_uprn is not None: self.standardised_asset_list[self.landlord_uprn] = ( self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn) ) # We keep just the columns we care about and will work through the various columns and standardise variables = [ self.landlord_property_id, self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname, self.full_address_colname, self.landlord_uprn, self.landlord_property_type, self.landlord_year_built, self.landlord_wall_construction, self.landlord_heating_system, self.landlord_existing_pv ] # Keep just non-null variables (e.g landlord may not provide uprn self.keep_variables = [v for v in variables if v is not None] self.rename_map = { self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, self.address1_colname: self.STANDARD_ADDRESS_1, self.postcode_colname: self.STANDARD_POSTCODE, self.full_address_colname: self.STANDARD_FULL_ADDRESS, self.landlord_uprn: self.STANDARD_UPRN, self.landlord_property_type: self.STANDARD_PROPERTY_TYPE, self.landlord_year_built: self.STANDARD_YEAR_BUILT, self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, self.landlord_existing_pv: self.STANDARD_EXISTING_PV } self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} if self.non_intrusives_present: self.keep_variables += self.NON_INTRUSIVES_COLNAMES self.rename_map = { **self.rename_map, **dict( zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) ) } # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ self.full_address_colname ].apply(lambda x: self._identify_multi_address(x)) # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and # we see instances of "average thermal transmittance" in the description self.standardised_asset_list[self.landlord_wall_construction] = np.where( self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( "average thermal transmittance" ), "new build - average thermal transmittance", self.standardised_asset_list[self.landlord_wall_construction] ) # Clear our build year column # We attempt to process the year built column if self.landlord_year_built is not None: # We check if we have a datetime - year built has not been renamed if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): # We treat any string columns - with common values we see self.standardised_asset_list[self.landlord_year_built] = ( self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP) ) self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime( self.standardised_asset_list[self.landlord_year_built] ) # Convert this to year self.standardised_asset_list[self.landlord_year_built] = ( self.standardised_asset_list[self.landlord_year_built].dt.year ) else: raise NotImplementedError("Year built column must be a datetime - implement me") # We now create standard lookups to_remap = { self.landlord_property_type: { "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES, "standard_map": property_type_mappings.PROPERTY_MAPPING }, self.landlord_wall_construction: { "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS, "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS }, self.landlord_heating_system: { "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS, "standard_map": heating_mappings.HEATING_MAPPINGS }, self.landlord_existing_pv: { "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV, "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS } } for variable, config in to_remap.items(): logger.info("Standardising variable: %s", variable) values_to_remap = self.standardised_asset_list[variable].unique() # We want to map this to our standardised list of property types we're interested in remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"]) remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist()) self.variable_mappings[variable] = remap_dictionary # We now print out the variable mappings, which can be reviewed by the user, before the final standardised # asset list is returned for variable, mapping in self.variable_mappings.items(): pprint(f"Variable: {variable}") pprint(mapping) # Print a space print("\n") pprint("=======================================") def apply_standardiation(self, override_empty_mappings=False): """ This function applies the standardisation to the asset list :param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant if there are no categories which need remapping which is highly unlikely :return: """ if not self.variable_mappings and not override_empty_mappings: raise ValueError("Please run init_standardise first") logger.info("Applying standardisation to asset list") for variable, mapping in self.variable_mappings.items(): self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): # Drop the dupes pprint( f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " f"addresses - dropping" ) self.standardised_asset_list = self.standardised_asset_list[ ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() ] # Apply renames to our standard names # Perform final variable selection and renaming: self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( columns=self.rename_map ) def merge_data(self, df: pd.DataFrame): """ Used to insert data into the standardised asset list, based on the domna property id :return: """ if self.DOMNA_PROPERTY_ID not in df.columns: raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") self.standardised_asset_list = self.standardised_asset_list.merge( df, how="left", on=self.DOMNA_PROPERTY_ID ) def extract_attributes(self): # Used to extracty the typical attributes that we use to identify viable work self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) ) accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] # The logic here is: # 1) Take the property type provided by the HA themselves # 2) In absence of that, take the EPC property type # 3) Otherwise use None self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( lambda x: estimate_number_of_floors( property_type=( x[self.STANDARD_PROPERTY_TYPE].title() if x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else ( x[self.EPC_API_DATA_NAMES["property-type"]] if not pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None ) ) ), axis=1 ) self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float) ) # Replace "" value with None self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None) ) self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float) ) # Estimate the perimeter self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply( lambda x: estimate_perimeter( floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], ), axis=1 ) self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply( lambda x: estimate_external_wall_area( num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS], floor_height=( float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5 ), perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER], built_form=x[self.EPC_API_DATA_NAMES["built-form"]] ), axis=1 ) self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ "insulation_thickness"] if not pd.isnull( x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, axis=1 ) # We produce some additional fields # 1) Is the SAP rating below C75 self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.FILLED_CAVITY_SAP_THRESHOLD ) # 2) Flag anything where the EPC is older than 5 years self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = ( pd.to_datetime( self.standardised_asset_list[self.EPC_API_DATA_NAMES["inspection-date"]] ).dt.year < self.EPC_YEAR_THRESHOLD ) self.process_age_band() def process_age_band(self): processed_age_band = [] for _, x in self.standardised_asset_list.iterrows(): if pd.isnull(x[self.EPC_API_DATA_NAMES["construction-age-band"]]) or ( x[self.EPC_API_DATA_NAMES["construction-age-band"]] in Definitions.DATA_ANOMALY_MATCHES ): processed_age_band.append( { self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": None, "Does Age Match EPC Age Band?": "No EPC Age Band" } ) continue # We exatract the upper and lower bounds if x[self.EPC_API_DATA_NAMES["construction-age-band"]] in [ "England and Wales: 2007 onwards", "England and Wales: 2012 onwards" ]: year_lower_bound = 2007 if x[self.EPC_API_DATA_NAMES[ "construction-age-band"]] == "England and Wales: 2007 onwards" else 2012 if pd.isnull(x[self.STANDARD_YEAR_BUILT]): age_band_matches = "No Year Built From Landlord" else: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] >= year_lower_bound else "EPC Age Band is older than Year Built" ) processed_age_band.append( { self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": year_lower_bound, "epc_year_upper_bound": None, "Does Age Match EPC Age Band?": age_band_matches } ) continue if x[self.EPC_API_DATA_NAMES["construction-age-band"]] == "England and Wales: before 1900": if pd.isnull(x[self.STANDARD_YEAR_BUILT]): age_band_matches = "No Year Built From Landlord" else: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] < 1900 else "EPC Age Band is newer than Year Built" ) processed_age_band.append( { self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": None, "epc_year_upper_bound": 1899, "Does Age Match EPC Age Band?": age_band_matches } ) continue if x[self.EPC_API_DATA_NAMES["construction-age-band"]].isdigit(): if pd.isnull(x[self.STANDARD_YEAR_BUILT]): age_band_matches = "No Year Built From Landlord" else: age_band_matches = ( "EPC Age Band Matches Year Built" if x[self.STANDARD_YEAR_BUILT] == int( x[self.EPC_API_DATA_NAMES["construction-age-band"]] ) else "EPC Age Band is different from Year Built" ) processed_age_band.append( { self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), "epc_year_upper_bound": int(x[self.EPC_API_DATA_NAMES["construction-age-band"]]), "Does Age Match EPC Age Band?": age_band_matches } ) continue # Oherwise, we extract the upper and lower bounds age_band = x[self.EPC_API_DATA_NAMES["construction-age-band"]].split(": ")[1] lower_date, upper_date = age_band.split("-") age_band_matches = ( "EPC Age Band Matches Year Built" if (x[self.STANDARD_YEAR_BUILT] >= float(lower_date)) and ( x[self.STANDARD_YEAR_BUILT] <= float(upper_date) ) else "EPC Age Band is older than Year Built" if x[self.STANDARD_YEAR_BUILT] > float(upper_date) else "EPC Age Band is newer than Year Built" ) processed_age_band.append( { self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID], "epc_year_lower_bound": int(lower_date), "epc_year_upper_bound": int(upper_date), "Does Age Match EPC Age Band?": age_band_matches } ) processed_age_band = pd.DataFrame(processed_age_band) self.standardised_asset_list = self.standardised_asset_list.merge( processed_age_band, how="left" ) def identify_worktypes(self, cleaned): if not self.non_intrusives_present: raise NotImplementedError("Need to implement the case for non-intrusives") # If we have non-intrusives completed, we can use this to identify work types if self.non_intrusives_present: ###################################################### # Empty cavity: ###################################################### # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled # 2) The age is before 1995 # TODO: 3) Remove anything that likley has access issues self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) & (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000) ) self.standardised_asset_list["epc_indicates_empty_cavity"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( self.EPC_NO_WALL_INSULATION_DESCRIPTIONS ) & ( self.standardised_asset_list["epc_year_upper_bound"] <= 1995 ) & ( ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] ) & ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD ) ) self.standardised_asset_list["empty_cavity"] = ( self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | self.standardised_asset_list["epc_indicates_empty_cavity"] ) # We add a reason self.standardised_asset_list["empty_cavity_reason"] = np.where( self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], "Non-Intrusive Data", "EPC Data" ) ###################################################### # Extraction ###################################################### # TODO When filterting like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged # as needing a CIGA check. What is the logic we should be applying here? self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & (~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"]) ) & ( self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] ) ) ###################################################### # Solar ###################################################### # Criteria: # TODO: Standardise these columns with our cleaned_data object # Check 1: Does the property have a valid heating system? self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] ) ) self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] .str.lower().str.contains("air source heat pump|ground source heat pump") ) | ( self.standardised_asset_list[ self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( "electric storage heaters" ) & ( self.standardised_asset_list[self.EPC_API_DATA_NAMES[ "mainheatcont-description"]] == "Controls for high heat retention storage heaters" ) ) ) # Check 2: Does the property have solar already self.standardised_asset_list["property_has_solar"] = ( (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | (self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") | (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) ) # Check 3: Does the property meet the fabric condition # Solar PV installs are subject to the minimum insulation requirements which means: # 1) one of the following insulation measures must be installed as part of the same # ECO4 project: # • roof insulation (flat roof, pitched roof, room-in-roof) # • exterior facing wall insulation (cavity wall, solid wall) # • party cavity wall insulation # • floor insulation (solid and underfloor) # # OR # # all measures (except any exempted measure referred to in paragraph 4.28) # listed in paragraph a) must already be installed # # With this in mind, we look for 2 clases # 1) The property is fully insulated apart from the loft (<200mm insulation) # 2) THe property is fully insulated self.standardised_asset_list["solar_landlord_walls_insulated"] = ( self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( ["filled cavity", "insulated solid brick"] ) ) # TODO: We don't have information about the roof from this landlord self.standardised_asset_list["solar_epc_walls_insulated"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) ) ) # We merge on the u-value for average thermal transmittance roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) roof_uvalue_data = roof_uvalue_data[ ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) ][["original_description", "thermal_transmittance"]].rename( columns={ "original_description": self.EPC_API_DATA_NAMES["roof-description"], "thermal_transmittance": "roof_u_value" } ) self.standardised_asset_list = self.standardised_asset_list.merge( roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] ) # If the u-value of a roof is less than 0.7 we consider it insulated self.standardised_asset_list["solar_epc_roof_insulated"] = ( self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False ) | ( self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( lambda x: int(x) >= 270 if str(x).isdigit() else False ) ) | ( self.standardised_asset_list["roof_u_value"].apply( lambda x: x <= 0.7 if not pd.isnull(x) else False ) ) ) self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( lambda x: int(x) < 270 if str(x).isdigit() else False ) self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[ self.EPC_API_DATA_NAMES["floor-description"] ].str.lower().str.contains("solid") self.standardised_asset_list["solar_epc_floor_is_solid"] = ( self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False) ) z = self.standardised_asset_list[ self.standardised_asset_list["solar_epc_floor_is_solid"] == True ]