diff --git a/.idea/terraform.xml b/.idea/terraform.xml new file mode 100644 index 00000000..cd46a3d3 --- /dev/null +++ b/.idea/terraform.xml @@ -0,0 +1,6 @@ + + + + + \ No newline at end of file diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py new file mode 100644 index 00000000..306edd99 --- /dev/null +++ b/asset_list/AssetList.py @@ -0,0 +1,1518 @@ +import hashlib +import os +import re +import tiktoken +from pprint import pprint +from datetime import datetime +from openai import OpenAI +import numpy as np +import pandas as pd +from fuzzywuzzy import process +from utils.logger import setup_logger +from backend.SearchEpc import SearchEpc +from BaseUtility import Definitions +import asset_list.mappings.property_type as property_type_mappings +import asset_list.mappings.walls as walls_mappings +import asset_list.mappings.heating_systems as heating_mappings +import asset_list.mappings.exising_pv as existing_pv_mappings + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +logger = setup_logger() + +# OpenAI API Key (set this in your environment variables for security) +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + + +class DataRemapper: + def __init__(self, standard_values, standard_map=None, max_tokens=1000): + """ + Initialize the remapper with standard values and a predefined mapping. + + :param standard_values: Set of allowed standardized values. + :param standard_map: Dictionary of common remappings {raw_value: standard_value}. 
+ """ + self.standard_values = standard_values + self.standard_map = standard_map + self.fuzzy_threshold = 90 # Adjust fuzzy matching sensitivity + self.ai_model = "gpt-4-turbo" # Use gpt-3.5-turbo for cheaper processing + + # Tokenizer for counting tokens + self.tokenizer = tiktoken.encoding_for_model(self.ai_model) + + # Track token usage and remap dictionary + self.total_tokens_used = 0 + self.total_cost = 0 + self.remap_dict = {} # {original_value: standardized_value} + self.max_tokens = max_tokens # Limit for OpenAI API + + # Memoization for AI calls + self.ai_cache = {} # {tuple(unmapped_values): {original_value: standardized_value}} + # Capture the reponse for debugging + self.ai_response = None + + # OpenAI pricing (as of Feb 2024) + self.pricing = { + "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000}, + "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000}, + } + + self.openai_client = OpenAI(api_key=OPENAI_API_KEY) + + @staticmethod + def clean_string(text): + """Basic text cleaning: remove extra spaces, punctuation, and normalize case.""" + if not isinstance(text, str): + return None + text = text.strip().lower() + text = re.sub(r'[^\w\s]', '', text) # Remove punctuation + # Replace double strings + text = re.sub(r'\s+', ' ', text) + return text + + def fuzzy_match(self, text): + """Use fuzzy matching to find the closest standard value.""" + match, score = process.extractOne(text, self.standard_values) if text else (None, 0) + return match if score >= self.fuzzy_threshold else None + + def count_tokens(self, text): + """Estimate the number of tokens in a given text.""" + return len(self.tokenizer.encode(text)) if text else 0 + + def ai_standardize(self, unmapped_values): + """Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.""" + if not unmapped_values: + return {} + + unmapped_tuple = tuple(sorted(unmapped_values)) # Ensure consistency for memoization + if unmapped_tuple in self.ai_cache: + 
return self.ai_cache[unmapped_tuple] # Return memoized result + + prompt = f""" + You are an expert in data classification. Standardize each of these values into one of the categories: + {list(self.standard_values)}. + + Return only a JSON dictionary where: + - The keys are the original values. + - The values are the standardized ones. + + Strictly return JSON **without markdown formatting** or extra text. + + Example Output: + {{ + "BLKHOUS": "block house", + "BEDSIT": "bedsit" + }} + + Values to standardize: + {unmapped_values} + """ + + # Count input tokens + input_tokens = self.count_tokens(prompt) + if input_tokens > self.max_tokens: + raise ValueError("Input tokens exceed the maximum limit.") + + logger.info("Calling OpenAI API for standardization...") + response = self.openai_client.chat.completions.create( + model=self.ai_model, + messages=[{"role": "user", "content": prompt}], + max_tokens=self.max_tokens, + temperature=0.1, + ) + + output_text = response.choices[0].message.content.strip() + output_tokens = self.count_tokens(output_text) # Count output tokens + + # Track total token usage + self.total_tokens_used += input_tokens + output_tokens + + # Estimate cost + input_cost = input_tokens * self.pricing[self.ai_model]["input"] + output_cost = output_tokens * self.pricing[self.ai_model]["output"] + self.total_cost += input_cost + output_cost + + try: + # Parse response as dictionary + mapping = eval(output_text) # OpenAI should return a valid dictionary + except: + mapping = {val: "unknown" for val in unmapped_values} # Fallback + + # Memoize the AI response + self.ai_cache[unmapped_tuple] = mapping + # We store the raw AI response for debugging + logger.debug(f"AI Response: {mapping}") + self.ai_response = output_text + + return mapping + + def standardize_list(self, values_to_remap): + """ + Standardizes a list of values and returns a dictionary {original_value: standardized_value}. + + :param values_to_remap: List of raw values to standardize. 
+ :return: Dictionary {original_value: standardized_value}. + """ + unique_values = set(values_to_remap) # Process only unique values + + unmapped_values = [] + for value in unique_values: + if pd.isna(value): # Handle NaN values + self.remap_dict[value] = "unknown" + continue + + cleaned_value = self.clean_string(value) + + # Rule-Based Check (Predefined Mapping) + if cleaned_value in self.standard_map or value in self.standard_map: + self.remap_dict[value] = ( + self.standard_map[cleaned_value] if cleaned_value in self.standard_map else self.standard_map[value] + ) + continue + + if value.lower() in self.standard_map: + self.remap_dict[value] = self.standard_map[value.lower()] + continue + + # Exact Match in Standard Values + if cleaned_value in self.standard_values: + self.remap_dict[value] = cleaned_value + continue + + # Fuzzy Matching + fuzzy_match = self.fuzzy_match(cleaned_value) + if fuzzy_match: + self.remap_dict[value] = fuzzy_match + continue + + # Capture anything that wasn't mapped + unmapped_values.append(value) + + # AI Model - remap anything unmapped (batch request) + ai_mapping = self.ai_standardize(unmapped_values) + self.remap_dict.update(ai_mapping) + + return self.remap_dict + + def report_usage(self): + """Prints a summary of token usage and cost.""" + print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}") + print(f"💰 Estimated Cost: ${self.total_cost:.4f}") + + +class AssetList: + """ + This class is used to standardise asset lists so that we can process the core information in a consistent manner. 
+ """ + + EPC_API_DATA_NAMES = { + "uprn": "epc_os_uprn", + "address1": "epc_address1", + "address": "epc_address", + "postcode": "epc_postcode", + "inspection-date": "epc_inspection_date", + "current-energy-efficiency": "epc_sap_score_on_register", + "current-energy-rating": "epc_rating_on_register", + "property-type": "epc_property_type", + "built-form": "epc_archetype", + "total-floor-area": "epc_total_floor_area", + "construction-age-band": "epc_age_band", + "floor-height": "epc_floor_height", + "number-habitable-rooms": "epc_number_habitable_rooms", + "walls-description": "epc_wall_construction", + "roof-description": "epc_roof_construction", + "floor-description": "epc_floor_construction", + "mainheat-description": "epc_heating_type", + 'mainheatcont-description': "epc_heating_controls", + "secondheat-description": "epc_secondary_heating", + "transaction-type": "epc_reason", + "energy-consumption-current": "epc_heat_demand", + "photo-supply": "epc_photo_supply", + "estimated": "estimated" + } + FIND_EPC_DATA_NAMES = { + "heating_text": "epc_estiamted_heating_kwh", + "hot_water_text": "epc_estimated_hotwater_kwh", + 'Assessor’s name': "epc_assessor_name", + "Assessor's Telephone": "epc_assessor_telephone", + "Assessor's Email": "epc_assessor_email", + "Accreditation scheme": "epc_assessor_accreditation", + "Assessor’s ID": "epc_assessor_id", + "Solar photovoltaics": "epc_solar_pv" + } + + DATETIME_REMAP = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + + # These are the accepted methods we have for cleaning the address1 column + ADDRESS_1_CLEANING_METHODS = [ + "first_two_words", # This method will split on the fist two words, where the separator is a space + "first_word", # This method will split on the first word, where the separator is a space + "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber + # "address1_extraction" # This method will use the NLP model to extract address1 + ] + + # 
Standard column Names + STANDARD_ADDRESS_1 = "domna_address_1" + STANDARD_POSTCODE = "domna_postcode" + STANDARD_FULL_ADDRESS = "domna_full_address" + STANDARD_YEAR_BUILT = "landlord_year_built" + STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" + STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_HEATING_SYSTEM = "landlord_heating_system" + STANDARD_EXISTING_PV = "landlord_existing_pv" + + DOMNA_PROPERTY_ID = "domna_property_id" + + # Regular expression for identifying if the address might point to multiple units + MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + + # List of columns relating to the non-intrusive data + NON_INTRUSIVES_COLNAMES = [ + "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required", + "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION", + "Any further surveyor notes", 'Surveyors Name' + ] + + # This SAP threshold is a key search criteria for properties that may be eligible for extraction + FILLED_CAVITY_SAP_THRESHOLD = 75 + # This SAP the + EMPTY_CAVITY_SAP_THRESHOLD = 75 + # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable + EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 + + # Attributes - these are columns that we produce, calcualted based on other pieces of data + ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" + ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" + ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter" + ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area" + ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness" + ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below" + ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}" + + # These are the descriptions that we look for in the EPC data that are indicative of no insulation + 
EPC_NO_WALL_INSULATION_DESCRIPTIONS = [ + "cavity wall, as built, no insulation (assumed)", + "cavity wall, as built, partial insulation (assumed)", + "cavity wall, as built, partial insulation", + "cavity wall, as built, no insulation", + ] + + # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated + EPC_INSULATED_WALLS_SUBSTRINGS = [ + ", insulated", "with external insulation", "with internal insulation", "filled cavity" + ] + + # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated + EPC_INSULATED_ROOF_SUBSTRINGS = [ + "(another dwelling above)", ", insulated", ", insulated (assumed) ", + ", ceiling insulated", + ] + + def __init__( + self, + local_filepath, + sheet_name, + address1_colname, + postcode_colname, + full_address_colname, + landlord_property_id=None, + full_address_cols_to_concat=None, + missing_postcodes_method=None, + address1_extraction_method=None, + landlord_year_built=None, + landlord_uprn=None, + landlord_property_type=None, + landlord_wall_construction=None, + landlord_heating_system=None, + landlord_existing_pv=None, + header=0 + ): + self.local_filepath = local_filepath + self.sheet_name = sheet_name + # Read in the data + self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name) + self.standardised_asset_list = self.raw_asset_list.copy() + # Will be used to store aggregated figures against the various work types + self.work_type_figures = {} + self.work_type_breakdowns = {} + self.flat_data = None + self.duplicated_addresses = None + + # We detect the presence of the non-intrusive columns + self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False + + # Names of columns + self.landlord_property_id = landlord_property_id + self.address1_colname = address1_colname + self.postcode_colname = postcode_colname + self.full_address_colname = full_address_colname + 
self.landlord_year_built = landlord_year_built + self.landlord_uprn = landlord_uprn + self.landlord_property_type = landlord_property_type + self.landlord_wall_construction = landlord_wall_construction + self.landlord_heating_system = landlord_heating_system + self.landlord_existing_pv = landlord_existing_pv + + # parameters for cleaning + self.full_address_cols_to_concat = full_address_cols_to_concat + self.missing_postcodes_method = missing_postcodes_method + self.address1_extraction_method = address1_extraction_method + + self.debug_information = { + "property_type": None, + "wall_construction": None, + "heating_system": None, + "existing_pv": None + } + + self.variable_mappings = {} + + self.rename_map = {} + self.keep_variables = [] + + # Finally, we handle the case where the landlord's property ID is actually the OS UPRN + if (self.landlord_uprn == self.landlord_property_id) and (self.landlord_property_id is not None): + self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy() + # Update the reference to landlord UPRn + self.landlord_uprn = self.STANDARD_UPRN + + def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"): + + if method not in self.ADDRESS_1_CLEANING_METHODS: + raise ValueError(f"Method {method} for producing address1 not recognized") + + if method == "first_two_words": + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") + return asset_list + + if method == "first_word": + asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0] + return asset_list + + if method == "house_number_extraction": + asset_list[self.address1_colname] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]), + axis=1 + ) + return asset_list + + raise ValueError(f"Method {method} not recognized") + + @staticmethod + def _address1_extraction(x): 
+ pass + + def create_property_id(self): + """ + This function creates the domna property ID, which is simply a hash of the full address and postcode + We want all figures to be positive + :return: + """ + + # We'll remove punctuation and whitespace from the address, before hashing to produce an ID + + def _make_hash(value): + """Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value.""" + # Normalize and remove special characters for cleaner ID + cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower() + + # Generate SHA-256 hash and truncate it + short_hash = hashlib.sha256(value.encode()).hexdigest()[:12] + + return f"{cleaned_value}-{short_hash}" + + # Apply transformation + self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = ( + self.standardised_asset_list[self.full_address_colname] + + self.standardised_asset_list[self.postcode_colname] + ).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash) + + @staticmethod + def _strip_postcode_from_full_address(full_address, postcode): + cleaned = full_address.replace(postcode, "") + # Remove any trailing commas and spaces + cleaned = cleaned.rstrip(", ").strip(",").strip() + return cleaned + + @classmethod + def _identify_multi_address(cls, address): + # We check if the address is comma separated + if "," in address: + address1_section = address.split(",")[0] + # We look for string in the form (x-y) + return bool(cls.MULTI_UNIT_REGEX.search(address1_section)) + + @staticmethod + def _convert_uprn(x): + """ + Used to convert UPRNS to integer strings + :param x: uprn to convert + :return: converted uprn + """ + + if pd.isnull(x): + return x + + # check if numeric + if np.isreal(x): + return str(int(x)) + + if str(x).isdigit(): + return str(int(x)) + return x + + def init_standardise(self): + """ + This function is used to standardise the asset list + :return: standardised asset list + """ + + # Remove rows without a 
postcode + if self.postcode_colname is not None: + self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname]) + + # We clean up portential non-breaking spaces, and double spaces + for col in [ + c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if + c is not None + ]: + self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False) + self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False) + + if self.address1_colname is None: + if self.address1_extraction_method is None: + raise ValueError("Missing address 1 - please specify an extraction method") + self.address1_colname = self.STANDARD_ADDRESS_1 + # If we do not have this, we produce it + self.standardised_asset_list = self._extract_address1( + asset_list=self.standardised_asset_list, + full_address_col=self.full_address_colname, + postcode_col=self.postcode_colname, + method=self.address1_extraction_method + ) + + if self.full_address_colname is None: + if not self.full_address_cols_to_concat: + raise ValueError("Missing full address - please specify columns to concatenate") + self.full_address_colname = self.STANDARD_FULL_ADDRESS + self.standardised_asset_list[self.full_address_colname] = ( + self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1) + ) + else: + + # Make sure to strip the postcode out of the full address + self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply( + lambda x: self._strip_postcode_from_full_address( + full_address=x[self.full_address_colname], + postcode=x[self.postcode_colname] + ), + axis=1 + ) + + # We create the domna property id + self.create_property_id() + + # Clean up the UPRN column, if the landlord has provided them + if self.landlord_uprn is 
not None: + self.standardised_asset_list[self.landlord_uprn] = ( + self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn) + ) + + # We keep just the columns we care about and will work through the various columns and standardise + variables = [ + self.landlord_property_id, + self.DOMNA_PROPERTY_ID, + self.address1_colname, + self.postcode_colname, + self.full_address_colname, + self.landlord_uprn, + self.landlord_property_type, + self.landlord_year_built, + self.landlord_wall_construction, + self.landlord_heating_system, + self.landlord_existing_pv + ] + # Keep just non-null variables (e.g landlord may not provide uprn + self.keep_variables = [v for v in variables if v is not None] + self.rename_map = { + self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID, + self.address1_colname: self.STANDARD_ADDRESS_1, + self.postcode_colname: self.STANDARD_POSTCODE, + self.full_address_colname: self.STANDARD_FULL_ADDRESS, + self.landlord_uprn: self.STANDARD_UPRN, + self.landlord_property_type: self.STANDARD_PROPERTY_TYPE, + self.landlord_year_built: self.STANDARD_YEAR_BUILT, + self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION, + self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM, + self.landlord_existing_pv: self.STANDARD_EXISTING_PV + } + self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None} + + if self.non_intrusives_present: + self.keep_variables += self.NON_INTRUSIVES_COLNAMES + self.rename_map = { + **self.rename_map, + **dict( + zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES]) + ) + } + + # We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y) + self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[ + self.full_address_colname + ].apply(lambda x: self._identify_multi_address(x)) + + # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and + # we see 
instances of "average thermal transmittance" in the description + self.standardised_asset_list[self.landlord_wall_construction] = np.where( + self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains( + "average thermal transmittance" + ) == True, + "new build - average thermal transmittance", + self.standardised_asset_list[self.landlord_wall_construction] + ) + + # Clear our build year column + # We attempt to process the year built column + if self.landlord_year_built is not None: + # We check if we have a datetime - year built has not been renamed + if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime): + # We treat any string columns - with common values we see + self.standardised_asset_list[self.landlord_year_built] = ( + self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP) + ) + + self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime( + self.standardised_asset_list[self.landlord_year_built] + ) + # Convert this to year + self.standardised_asset_list[self.landlord_year_built] = ( + self.standardised_asset_list[self.landlord_year_built].dt.year + ) + else: + # We attempt to convert the year built to a datetime, by detecting the format and converting + + def extract_year(date_str): + """ + Extracts the year from a date string in the format '01-Jul-YYYY'. + Returns the extracted year as an integer or None if the format is incorrect. 
+ """ + known_errors = ["#MULTIVALUE"] + + if pd.isnull(date_str) or date_str in known_errors: + return None + + if isinstance(date_str, str): + match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str) + if match: + return int(match.group(1)) # Extract the year and convert to integer + + if isinstance(date_str, datetime): + return date_str.year + + # Check if date_str is a year itself + if str(date_str).isdigit() & (len(str(date_str)) == 4): + return int(date_str) + + raise NotImplementedError("Unhandled format for year built - implement me") + + self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[ + self.landlord_year_built + ].apply(extract_year) + + # We now create standard lookups + to_remap = { + self.landlord_property_type: { + "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES, + "standard_map": property_type_mappings.PROPERTY_MAPPING + }, + self.landlord_wall_construction: { + "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS, + "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS + }, + self.landlord_heating_system: { + "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS, + "standard_map": heating_mappings.HEATING_MAPPINGS + }, + self.landlord_existing_pv: { + "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV, + "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS + } + } + # Keep just entries where the key is not None + to_remap = {k: v for k, v in to_remap.items() if k is not None} + + for variable, config in to_remap.items(): + logger.info("Standardising variable: %s", variable) + values_to_remap = self.standardised_asset_list[variable].unique() + # We want to map this to our standardised list of property types we're interested in + remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"]) + remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist()) + self.variable_mappings[variable] = 
remap_dictionary + + # We now print out the variable mappings, which can be reviewed by the user, before the final standardised + # asset list is returned + for variable, mapping in self.variable_mappings.items(): + pprint(f"Variable: {variable}") + pprint(mapping) + # Print a space + print("\n") + pprint("=======================================") + + def apply_standardiation(self, override_empty_mappings=False): + """ + This function applies the standardisation to the asset list + :param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant + if there are no categories which need remapping which is highly unlikely + :return: + """ + if not self.variable_mappings and not override_empty_mappings: + raise ValueError("Please run init_standardise first") + + logger.info("Applying standardisation to asset list") + + for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable + "_original_from_landlord"] = ( + self.standardised_asset_list[variable].copy() + ) + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + # Drop the dupes + pprint( + f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " + f"addresses - dropping" + ) + + # Keep a record of duplicates + self.duplicated_addresses = self.standardised_asset_list[ + self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ][[self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]].copy() + + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ] + + # Apply renames to our standard names + # Perform final variable selection and renaming: + + # We add the original columns to the keep variables + self.keep_variables += [ + k + "_original_from_landlord" for k in 
self.variable_mappings.keys() + ] + + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( + columns=self.rename_map + ) + + # We fill any standard columns that are not in the data because they were not provided by the landlord + missing_variables = [ + v for v in [ + self.STANDARD_EXISTING_PV, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_UPRN, + self.STANDARD_PROPERTY_TYPE, + self.STANDARD_YEAR_BUILT, + self.STANDARD_WALL_CONSTRUCTION, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_EXISTING_PV + ] if v not in self.standardised_asset_list.columns + ] + for v in missing_variables: + self.standardised_asset_list[v] = None + + def merge_data(self, df: pd.DataFrame): + """ + Used to insert data into the standardised asset list, based on the domna property id + :return: + """ + if self.DOMNA_PROPERTY_ID not in df.columns: + raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") + + if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + + self.standardised_asset_list = self.standardised_asset_list.merge( + df, how="left", on=self.DOMNA_PROPERTY_ID + ) + + def extract_attributes(self): + # Used to extracty the typical attributes that we use to identify viable work + + self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( + self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | + ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, ""]) + ) + + accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] + + # The logic here is: + # 1) Take the property type provided by the HA themselves + # 2) In absence of that, take the EPC property type + # 3) Otherwise use None + self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( + lambda x: estimate_number_of_floors( + property_type=( + 
x[self.STANDARD_PROPERTY_TYPE].title() if + x[self.STANDARD_PROPERTY_TYPE].title() in accepted_epc_property_types else ( + x[self.EPC_API_DATA_NAMES["property-type"]] if not + pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None + ) + ) + ), + axis=1 + ) + + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float) + ) + # Replace "" value with None + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None) + ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float) + ) + + # Estimate the perimeter + self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + ), axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + floor_height=( + float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if + x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5 + ), + perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER], + built_form=x[self.EPC_API_DATA_NAMES["built-form"]] + ), + axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( + lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + "insulation_thickness"] if not pd.isnull( + x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, 
def process_age_band(self):
    """
    Compare each property's EPC construction age band with the landlord's year built.

    For every row of ``self.standardised_asset_list`` this derives three values and
    merges them back onto the asset list:

    * ``epc_year_lower_bound`` / ``epc_year_upper_bound``: the years bounding the
      EPC age band (``None`` where the band is open-ended or unavailable).
    * ``does_age_band_match_epc_age_band``: a label saying whether the landlord's
      year built falls inside the band, or why the comparison could not be made.

    Labels follow one convention in every branch: the EPC band is "older" when the
    landlord's year built is later than the band, and "newer" when it is earlier.
    """
    band_column = self.EPC_API_DATA_NAMES["construction-age-band"]

    label_matches = "EPC Age Band Matches Year Built"
    label_older = "EPC Age Band is older than Year Built"
    label_newer = "EPC Age Band is newer than Year Built"
    label_no_year = "No Year Built From Landlord"

    # Bands with a lower bound only.
    open_ended_lower_bounds = {
        "England and Wales: 2007 onwards": 2007,
        "England and Wales: 2012 onwards": 2012,
    }

    def classify(age_band, year_built):
        """Return (lower_bound, upper_bound, label) for a single property."""
        if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES:
            return None, None, "No EPC Age Band"

        year_missing = pd.isnull(year_built)

        if age_band in open_ended_lower_bounds:
            lower = open_ended_lower_bounds[age_band]
            if year_missing:
                label = label_no_year
            elif year_built >= lower:
                label = label_matches
            else:
                # Year built precedes the band, so the band is the newer of the two.
                label = label_newer
            return lower, None, label

        if age_band == "England and Wales: before 1900":
            if year_missing:
                label = label_no_year
            elif year_built < 1900:
                label = label_matches
            else:
                # Year built is after the band, so the band is the older of the two.
                label = label_older
            return None, 1899, label

        if age_band.isdigit():
            # The band is a single explicit year rather than a range.
            band_year = int(age_band)
            if year_missing:
                label = label_no_year
            elif year_built == band_year:
                label = label_matches
            else:
                label = "EPC Age Band is different from Year Built"
            return band_year, band_year, label

        # Otherwise the band is a closed range of the form "<region>: YYYY-YYYY".
        lower_text, upper_text = age_band.split(": ")[1].split("-")
        lower, upper = int(lower_text), int(upper_text)
        if year_missing:
            # Comparisons against NaN are always False, which would otherwise
            # fall through to the "newer" label.
            label = label_no_year
        elif lower <= year_built <= upper:
            label = label_matches
        elif year_built > upper:
            label = label_older
        else:
            label = label_newer
        return lower, upper, label

    asset_list = self.standardised_asset_list
    records = []
    for property_id, age_band, year_built in zip(
        asset_list[self.DOMNA_PROPERTY_ID],
        asset_list[band_column],
        asset_list[self.STANDARD_YEAR_BUILT],
    ):
        lower, upper, label = classify(age_band, year_built)
        records.append(
            {
                self.DOMNA_PROPERTY_ID: property_id,
                "epc_year_lower_bound": lower,
                "epc_year_upper_bound": upper,
                "does_age_band_match_epc_age_band": label,
            }
        )

    # Explicit columns keep the merge key present even when there are no rows.
    processed_age_band = pd.DataFrame(
        records,
        columns=[
            self.DOMNA_PROPERTY_ID,
            "epc_year_lower_bound",
            "epc_year_upper_bound",
            "does_age_band_match_epc_age_band",
        ],
    )

    self.standardised_asset_list = self.standardised_asset_list.merge(
        processed_age_band, how="left"
    )
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + False, + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"] + ) + + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= 1995 + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD + ) + ) + + # If the EPC is esimtated, we defer to the non-intrusives + self.standardised_asset_list["epc_indicates_empty_cavity"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + self.standardised_asset_list["estimated"] + ), + False, + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + + ###################################################### + # Extraction + ###################################################### + + # as needing a CIGA check. What is the logic we should be applying here? 
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) + ) & ( + self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] + ) + ) + + # Also include work without the SAP filter as optimistic + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + ) + ) + ) + + # Adjust + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"], + False, + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + ) + + ###################################################### + # Solar + ###################################################### + # Criteria: + # Check 1: Does the property have a valid heating system? 
+ self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"] + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().str.contains("air source heat pump|ground source heat pump") + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters" + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES[ + "mainheatcont-description"]] == "Controls for high heat retention storage heaters" + ) + ) + ) + + # Check 2: Does the property have solar already + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + (self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") | + (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + + # Check 3: Does the property meet the fabric condition + # Solar PV installs are subject to the minimum insulation requirements which means: + # 1) one of the following insulation measures must be installed as part of the same + # ECO4 project: + # • roof insulation (flat roof, pitched roof, room-in-roof) + # • exterior facing wall insulation (cavity wall, solid wall) + # • party cavity wall insulation + # • floor insulation (solid and underfloor) + # + # OR + # + # all measures (except any exempted measure referred to in paragraph 4.28) + # listed in paragraph a) must already be installed + # + # With this in mind, we look for 2 clases + # 1) The property is fully insulated apart from the loft (<200mm insulation) + # 2) THe property is fully insulated + + 
self.standardised_asset_list["solar_landlord_walls_insulated"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( + ["filled cavity", "insulated solid brick"] + ) + ) + + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: Insulated"].isin( + ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] + ) + ) + + # TODO: We don't have information about the roof from this landlord + + # We merge on the u-value for average thermal transmittance + walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) + walls_uvalue_data = walls_uvalue_data[ + ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["walls-description"], + "thermal_transmittance": "walls_u_value" + } + ) + self.standardised_asset_list = self.standardised_asset_list.merge( + walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + ) + + self.standardised_asset_list["solar_epc_walls_insulated"] = ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES[ + "walls-description"]].str.lower().str.contains( + "|".join( + self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) | ( + self.standardised_asset_list[ + "walls_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + # We merge on the u-value for average thermal transmittance + roof_uvalue_data = pd.DataFrame(cleaned["roof-description"]) + roof_uvalue_data = roof_uvalue_data[ + ~pd.isnull(roof_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["roof-description"], + "thermal_transmittance": "roof_u_value" + } + ) + + self.standardised_asset_list = self.standardised_asset_list.merge( + roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + ) + + # If the u-value of a 
roof is less than 0.7 we consider it insulated + self.standardised_asset_list["solar_epc_roof_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False + ) | ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) >= 200 if str(x).isdigit() else False + ) + ) | ( + self.standardised_asset_list["roof_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 200 if str(x).isdigit() else False + ) + + # TODO: Fill with False - should be temp! + self.standardised_asset_list["epc_has_floor_recommendation"] = ( + self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) + ) + + # We merge on the u-value for average thermal transmittance + floors_uvalue_data = pd.DataFrame(cleaned["floor-description"]) + floors_uvalue_data = floors_uvalue_data[ + ~pd.isnull(floors_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["floor-description"], + "thermal_transmittance": "floor_u_value" + } + ) + + # Merge on + self.standardised_asset_list = self.standardised_asset_list.merge( + floors_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["floor-description"] + ) + + # We assume that a U-value of 0.5 or below is indicative of an insulated floor + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] = ( + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("solid") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always 
find that + # "epc_has_floor_recommendation" is False + (self.standardised_asset_list["estimated"] == False) + ) + ) | ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"]].str.lower().str.contains("solid") + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str.lower() + .str.contains(", insulated") + ) + ) + ) + + # Check for other floor types, insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] = ( + # The floor is suspended and insulated + ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["floor-description"]].str + .lower().str.contains("suspended") + ) & ( + ~self.standardised_asset_list["epc_has_floor_recommendation"] + ) & ( + # We do not utilise estimated EPCs for this method because we will always find that + # "epc_has_floor_recommendation" is False + self.standardised_asset_list["estimated"] == False + ) + ) | ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains("suspended") + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["floor-description"] + ].str.lower().str.contains(", insulated") + ) + ) | ( + self.standardised_asset_list["floor_u_value"].apply( + lambda x: x <= 0.5 if not pd.isnull(x) else False + ) + ) + ) + + # We now put together the criteria: + # Flag properties that look eligible for solar, that have solid floors + # TODO: We'll need to revise this + self.standardised_asset_list["solar_eligible_solid_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + 
self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Solid floor but needs a loft top-up + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] | + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + self.standardised_asset_list["solar_epc_floor_is_solid_no_recommendation"] + ) + + # Other floor type, fully insulated + + self.standardised_asset_list["solar_eligible_other_floor"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof is insulated + self.standardised_asset_list["solar_epc_roof_insulated"] & + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) + + # Other floor type, needs loft 
top-up + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] = ( + # Landlord data or EPC data indicates the heating system is appropriate + ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] + ) & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + ( + self.standardised_asset_list["solar_landlord_walls_insulated"] | + self.standardised_asset_list["solar_epc_walls_insulated"] + ) & + # Roof need loft top-up + self.standardised_asset_list["solar_epc_loft_needs_topup"] & + # Floor is not solid, but is insulated + self.standardised_asset_list["solar_epc_floor_is_other_insulated"] + ) + + # Drop anything we don't need + self.standardised_asset_list = self.standardised_asset_list.drop( + columns=["walls_u_value", "roof_u_value", "floor_u_value"] + ) + + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible_solid_floor"] & + ~self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"] + # ~self.standardised_asset_list["solar_eligible_other_floor"] & + # ~self.standardised_asset_list["solar_eligible_other_floor_needs_loft"] + ) + + blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" + ] + + non_blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" + ] + + # Produce some aggregate figures + self.work_type_figures = { + # Empty cavity from non-intrusives + "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(), + "Empty Cavity (non-intrusives, 
blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum() + ), + "Empty Cavity (non-intrusives, no SAP filter)": ( + non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), + "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": ( + blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum() + ), + "Empty Cavity (EPC)": ( + ( + non_blocks_of_flats["epc_indicates_empty_cavity"] & + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Empty Cavity (EPC, blocks of flat)": ( + ( + blocks_of_flats["epc_indicates_empty_cavity"] & + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] + ).sum() + ), + "Cavity Extraction": ( + ( + ~non_blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~non_blocks_of_flats["epc_indicates_empty_cavity"] & + non_blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (blocks of flats)": ( + ( + ~blocks_of_flats["non_intrusive_indicates_empty_cavity"] & + ~blocks_of_flats["epc_indicates_empty_cavity"] & + blocks_of_flats["non_intrusive_indicates_cavity_extraction"] + ).sum() + ), + "Cavity Extraction (no SAP filter)": ( + ( + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + ~self.standardised_asset_list["epc_indicates_empty_cavity"] & + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] + ).sum() + ), + "Solar PV (Solid Floor)": ( + self.standardised_asset_list["solar_eligible_solid_floor"].sum() + ), + "Solar PV (Solid Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"].sum() + ), + "Solar PV (Other Floor)": ( + self.standardised_asset_list["solar_eligible_other_floor"].sum() + ), + "Solar PV (Other Floor, Needs Loft Top-up)": ( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"].sum() + ) + } + + # We produce a breakdown of the property types, for 
cavity fills + cavity_fills = self.standardised_asset_list[ + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] | ( + self.standardised_asset_list["epc_indicates_empty_cavity"] + ) + ] + + self.work_type_breakdowns = { + "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts() + } + + # Finally, we note why each property has been flagged + self.standardised_asset_list["cavity_reason"] = None + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"], + "Non-Intrusive Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_sap_filter"], + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] + ), + "EPC Data Showed Empty Cavity", + self.standardised_asset_list["cavity_reason"] + ) + # Flag extraction + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction", + self.standardised_asset_list["cavity_reason"] + ) + # extraction no sap filter + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed", + self.standardised_asset_list["cavity_reason"] + ) + + # Flag solar + self.standardised_asset_list["solar_reason"] = None + 
self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor"], + "Solid Floor, Insulated, No Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_solid_floor_needs_loft"], + "Solid Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor"], + "Other Floor, Insulated, No Solar", + self.standardised_asset_list["solar_reason"] + ) + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list["solar_eligible_other_floor_needs_loft"], + "Other Floor, Insulated, Needs Loft", + self.standardised_asset_list["solar_reason"] + ) + + def flat_analysis(self): + + # We need to deduce the building name - we strip out the house number + + # We want to deduce if flats have 50% of the properties below C75 + # We group by postcode and property type + grouped = self.standardised_asset_list.groupby( + [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE] + ) + + flat_data = [] + for _, group in grouped: + if "flat" in group[self.STANDARD_PROPERTY_TYPE].values: + num_flats = group[self.STANDARD_PROPERTY_TYPE].shape[0] + num_below_c75 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum() + # Check if any flats are below C69 + num_flats_below_c69 = group[ + self.EPC_API_DATA_NAMES["current-energy-efficiency"] + ].lt(69).sum() + + flat_data.append( + { + "Postcode": group[self.STANDARD_POSTCODE].iloc[0], + "Property Type": "Flat", + "Number of Flats with EPC": num_flats, + "Number of Flats below C75": num_below_c75, + "Proportion of Flat EPCs below C75": round(100 * num_below_c75 / num_flats), + "Number of Flats Below C69": num_flats_below_c69, + } + ) + + flat_data = pd.DataFrame(flat_data) + + 
def get_data(
    df, manual_uprn_map, epc_api_only=False, row_id_name="row_id"
):
    """
    Retrieve the newest EPC record for every property in ``df``.

    Each row is searched via ``SearchEpc``. If the first search finds nothing and
    no UPRN is known, the search is retried with an alternative first address
    line. If that still finds nothing and the standard property type maps to an
    EPC property type, the search is run once more with that type set on the
    Ordnance Survey client. Unless ``epc_api_only`` is set, the EPC
    recommendations and the FindMyEPC data are attached to each record as well.

    :param df: Standardised asset list; one row per property.
    :param manual_uprn_map: Dictionary {full_address: uprn} overriding the UPRN
        column for problematic addresses.
    :param epc_api_only: When True, return only the EPC API record for each row.
    :param row_id_name: Name of the column holding the row identifier.
    :return: Tuple ``(epc_data, errors, no_epc)``: the list of EPC records, the
        row ids whose processing raised, and the row ids with no EPC found.
    """
    # Local import so the block does not depend on a module-level logging import.
    import logging

    log = logging.getLogger(__name__)

    uprn_column = AssetList.STANDARD_UPRN
    fulladdress_column = AssetList.STANDARD_FULL_ADDRESS
    address1_column = AssetList.STANDARD_ADDRESS_1
    postcode_column = AssetList.STANDARD_POSTCODE

    # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
    property_type_map = {
        "house": "House",
        "flat": "Flat",
        "maisonette": "Maisonette",
        "bungalow": "Bungalow",
        "block house": "House",
        "coach house": "House",
        "bedsit": "Flat"
    }

    epc_data = []
    errors = []
    no_epc = []
    for _, home in tqdm(df.iterrows(), total=len(df)):
        try:

            # If we have a block of flats, we cannot retrieve this data
            if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats":
                no_epc.append(home[row_id_name])
                continue

            postcode = home[postcode_column]
            house_number = str(home[address1_column]).strip()
            full_address = home[fulladdress_column].strip()

            # Parsed once and reused by the retry below.
            parsed_house_no = SearchEpc.get_house_number(address=house_number, postcode=postcode)
            house_no = house_number if parsed_house_no is None else parsed_house_no

            uprn = manual_uprn_map.get(full_address, None)
            if uprn is None and home.get(uprn_column):
                uprn = home[uprn_column]

            if pd.isnull(uprn):
                uprn = None

            property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None)

            searcher = SearchEpc(
                address1=str(house_no),
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5,
                uprn=uprn
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)

            # Nothing found and no UPRN to rely on: retry with a different first address line
            if searcher.newest_epc is None and uprn is None:
                if parsed_house_no is None:
                    # Backup: second comma-separated part, else the first word
                    address_parts = full_address.split(",")
                    if len(address_parts) > 1:
                        add1 = address_parts[1].strip()
                    else:
                        add1 = full_address.split(" ")[0].strip()
                else:
                    add1 = house_number

                searcher = SearchEpc(
                    address1=add1,
                    postcode=postcode,
                    auth_token=EPC_AUTH_TOKEN,
                    os_api_key="",
                    property_type=None,
                    fast=True,
                    full_address=full_address,
                    max_retries=5
                )

                # Check if we have a flat or apartment
                lowered_house_number = house_number.lower()
                if any(marker in lowered_house_number for marker in ("flat", "apartment", "apt")):
                    searcher.ordnance_survey_client.property_type = "Flat"

                searcher.find_property(skip_os=True)

            # As a final resort, we estimate the EPC
            if property_type is not None and searcher.newest_epc is None:
                searcher.ordnance_survey_client.property_type = property_type
                searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                no_epc.append(home[row_id_name])
                continue

            if epc_api_only:
                epc_data.append(
                    {
                        row_id_name: home[row_id_name],
                        **searcher.newest_epc.copy()
                    }
                )
                continue

            # Look for EPC recommendations; a failed lookup is treated as "no recommendations"
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                property_recommendations = {"rows": []}

            # Retrieve data from FindMyEPC. Start from an empty result so that a failed
            # lookup can never reuse the value left over from the previous row.
            find_epc_data = {}
            try:
                find_epc_data = RetrieveFindMyEpc(
                    address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
                ).retrieve_newest_find_my_epc_data()
            except ValueError as e:
                if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
                    # Retry with the first address line only
                    try:
                        find_epc_data = RetrieveFindMyEpc(
                            address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
                        ).retrieve_newest_find_my_epc_data()
                    except ValueError:
                        find_epc_data = {}
            except Exception as e:
                raise Exception(f"Error retrieving FindMyEPC data: {e}") from e
            time.sleep(np.random.uniform(0.1, 1))

            epc_data.append(
                {
                    row_id_name: home[row_id_name],
                    **searcher.newest_epc.copy(),
                    "recommendations": property_recommendations["rows"],
                    "find_my_epc_data": find_epc_data,
                }
            )
        except Exception:
            # Best effort: record the failing row and carry on, but keep the traceback.
            log.exception("Failed to retrieve EPC data for row %s", home.get(row_id_name))
            errors.append(home[row_id_name])
            time.sleep(5)

    return epc_data, errors, no_epc
def app():
    """
    Build a standardised, EPC-enriched asset list for a landlord portfolio and export it to Excel.

    The function was originally written for a Livewest data request; the active configuration below
    points at the Colchester Borough Homes workbook and a Westward configuration is kept as comments.
    All configuration (paths, sheet and column names) is hard-coded in the body.

    Steps, in order:
      1. Load the landlord workbook into an ``AssetList`` and standardise it.
      2. Retrieve EPC data with ``get_data`` in chunks of ``chunk_size`` rows, retrying the rows that
         errored, and cache each chunk as ``<data_folder>/Chunks/Chunk <i>.csv``.
      3. Read the cached chunks back, flag properties whose EPC carries a floor insulation
         recommendation and flatten the FindMyEPC payload into columns.
      4. Merge the EPC data into the asset list, extract attributes, identify work types (using the
         cleaned EPC data read from S3) and run the flat analysis.
      5. Write ``<data_filename without extension> - Standardised.xlsx`` with two sheets.

    Data request contents:
        Date of last EPC
        Reason for EPC
        SAP score on register
        Property Type
        Property Area
        Property Age
        Any Dimensions (HLP,PW,RH)
        Property Wall Construction
        Heating Type
        Secondary Heating
        Loft Insulation Depth

    Additional if possible:
        Heat loss calculations
        EPC recommendations
        Property UPRN

    :raises ValueError: If the standardised asset list is empty, since there is nothing to retrieve.
    """
    # Local import: only this function needs it, for parsing the cached chunk files.
    import ast

    # TODO:
    #  For cavity work:
    #  - Flag any entries that have a different wall type between non-intrusive data against EPC
    #  - Worth double checking entries that have a difference in wall construction
    #  - Look at anything that is flagged as an empty cavity but the EPC data says it's a filled cavity
    #  - Look at the current EPC scores - Anything that is C75 or above, especially if it's assumed no insulation
    #  - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
    #  are less than C75
    #  - Flag anything pre SAP2012
    #  - Flag anything over 5 years old
    #  - Look at year built vs age band
    #
    #  For Solar:
    #  - Discount any that have solar PV - based on non-intrusives and from the inspections team
    #  - In the heating, discount anything that isn't ashp, ghsp, hhrs, electric storage - possibly homes with
    #  electric room heaters but it might need to be an EPC E
    #  - Fabric - check the floor, wall and roof:
    #  - Filled or empty cavity is good
    #  - Insulated solid/timber/system built is good
    #  - SCIS/CEG needs solid floors
    #  - JJC don't care
    #  - Anything with a loft 200 or below
    #  - Anything C75 and above won't qualify
    #  - Insulated loft = 200mm
    #  - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
    #  - Or the insulation required is loft/cavity (floors should be solid)

    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
    data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
    sheet_name = "Sheet1"
    postcode_column = 'Full Address.1'
    fulladdress_column = "Full Address"
    address1_column = None
    address1_method = "first_word"
    address_cols_to_concat = []
    missing_postcodes_method = None
    landlord_year_built = "Build Date"
    landlord_os_uprn = None
    landlord_property_type = "Property Type"
    landlord_wall_construction = "Wallinsul"
    landlord_heating_system = "HeatSorc"
    landlord_existing_pv = None
    landlord_property_id = "Property Reference"

    # For Westward
    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
    # data_filename = "WESTWARD - completed list..xlsx"
    # sheet_name = "Sheet1"
    # postcode_column = "WFT EDIT Postcode"
    # fulladdress_column = "Address"
    # address1_column = None
    # address1_method = "house_number_extraction"
    # address_cols_to_concat = []
    # missing_postcodes_method = None
    # landlord_year_built = "Build date"
    # landlord_os_uprn = "UPRN"
    # landlord_property_type = "Location type"
    # landlord_wall_construction = "Wall Construction (EPC)"
    # landlord_heating_system = "Heat Source"
    # landlord_existing_pv = "PV (Y/N)"
    # landlord_property_id = "Place ref"

    # Maps addresses to uprn in problematic cases
    manual_uprn_map = {}

    asset_list = AssetList(
        local_filepath=os.path.join(data_folder, data_filename),
        header=0,
        sheet_name=sheet_name,
        address1_colname=address1_column,
        postcode_colname=postcode_column,
        landlord_property_id=landlord_property_id,
        full_address_colname=fulladdress_column,
        full_address_cols_to_concat=address_cols_to_concat,
        missing_postcodes_method=missing_postcodes_method,
        address1_extraction_method=address1_method,
        landlord_year_built=landlord_year_built,
        landlord_uprn=landlord_os_uprn,
        landlord_property_type=landlord_property_type,
        landlord_wall_construction=landlord_wall_construction,
        landlord_heating_system=landlord_heating_system,
        landlord_existing_pv=landlord_existing_pv
    )
    asset_list.init_standardise()

    def _extended_map(base_map, landlord_column):
        """Return a new dict: ``base_map`` plus the mappings learned for ``landlord_column`` (if configured)."""
        learned = asset_list.variable_mappings[landlord_column] if landlord_column else {}
        # dict.update() mutates in place and returns None, so `base.copy().update(...)` would bind None.
        # Build the merged dict explicitly instead; the base mapping is left untouched.
        return {**base_map, **learned}

    # We produce the new maps, which can be saved for future usage
    new_property_type_map = _extended_map(PROPERTY_MAPPING, asset_list.landlord_property_type)
    new_wall_map = _extended_map(WALL_CONSTRUCTION_MAPPINGS, asset_list.landlord_wall_construction)
    new_heating_map = _extended_map(HEATING_MAPPINGS, asset_list.landlord_heating_system)
    new_existing_pv_map = _extended_map(EXISTING_PV_MAPPINGS, asset_list.landlord_existing_pv)

    asset_list.apply_standardiation()

    ### We retrieve the EPC data

    # We chunk up this data into 5000 rows at a time and cache every chunk on disk
    force_retrieve_data = False  # Set to True to re-download chunks that are already cached
    chunk_size = 5000
    chunk_filename = "Chunk {i}.csv"
    download_folder = os.path.join(data_folder, "Chunks")
    os.makedirs(download_folder, exist_ok=True)

    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
    if not chunk_indexes:
        raise ValueError("The standardised asset list is empty; there is no EPC data to retrieve")

    # A chunk CSV is only written once the whole chunk has been processed, so a chunk whose file is
    # already present is skipped individually. This lets an interrupted run resume where it stopped
    # instead of re-downloading everything unless every single chunk is present.
    already_downloaded = set(os.listdir(download_folder))

    for i in chunk_indexes:
        print(f"Processing chunk {i} to {i + chunk_size}")
        if not force_retrieve_data and chunk_filename.format(i=i) in already_downloaded:
            continue

        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
            df=chunk,
            row_id_name=asset_list.DOMNA_PROPERTY_ID,
            manual_uprn_map=manual_uprn_map,
        )

        # We now retry any failed properties and append whatever we get to the chunk
        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
        epc_data_failed, _, _ = get_data(
            df=chunk_failed,
            row_id_name=asset_list.DOMNA_PROPERTY_ID,
            manual_uprn_map=manual_uprn_map,
            epc_api_only=False
        )
        epc_data_chunk.extend(epc_data_failed)

        # Store the chunk locally as a csv
        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(download_folder, chunk_filename.format(i=i)), index=False)
        # Store the errors and no-data locally
        with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f:
            json.dump(errors_chunk, f)

        # NOTE: the content is JSON even though the file is named .csv; the name is kept as-is so that
        # anything already reading these files keeps working.
        with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
            json.dump(no_epc_chunk, f)

    # We read in and concatenate the created chunks, in chunk order so that the result is deterministic
    epc_data = []
    for i in chunk_indexes:
        csv_data = pd.read_csv(os.path.join(download_folder, chunk_filename.format(i=i)))
        # The nested values were written as their Python repr, so they need converting back.
        # ast.literal_eval only accepts Python literals, unlike eval, which would execute any expression
        # found in the file.
        csv_data["recommendations"] = csv_data["recommendations"].apply(ast.literal_eval)
        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(ast.literal_eval)
        epc_data.append(csv_data)

    # ignore_index gives one clean 0..n-1 index. Without it every chunk restarts at 0 and the positional
    # join with the normalised FindMyEPC frame below would match rows from different chunks.
    epc_df = pd.concat(epc_data, ignore_index=True)
    epc_df["estimated"] = epc_df["estimated"].fillna(False)

    # Flag the properties whose EPC recommends any kind of floor insulation. This is computed directly
    # from the recommendation texts so that it also works when one of them never occurs in the data.
    floor_recommendations = {
        "Floor insulation (solid floor)", "Floor insulation", "Floor insulation (suspended floor)"
    }
    transformed_df = pd.DataFrame({
        asset_list.DOMNA_PROPERTY_ID: epc_df[asset_list.DOMNA_PROPERTY_ID],
        "epc_has_floor_recommendation": [
            any(rec["improvement-summary-text"] in floor_recommendations for rec in recommendations)
            for recommendations in epc_df["recommendations"]
        ],
    })

    # Get the find my epc data: one column per key, aligned positionally with epc_df
    find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID]].join(
        pd.json_normalize(epc_df["find_my_epc_data"].tolist())
    )
    find_my_epc_data = find_my_epc_data.merge(
        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
        how="left", on=asset_list.DOMNA_PROPERTY_ID
    )

    # We check if we get the solar pv column:
    if "Solar photovoltaics" not in find_my_epc_data.columns:
        find_my_epc_data["Solar photovoltaics"] = False

    # Retrieve just the data we need
    epc_df = epc_df[
        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
    ].rename(
        columns=asset_list.EPC_API_DATA_NAMES
    )

    epc_df = epc_df.merge(
        find_my_epc_data[
            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
        ].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
        how="left",
        on=asset_list.DOMNA_PROPERTY_ID
    )

    asset_list.merge_data(epc_df)

    asset_list.extract_attributes()

    cleaned = read_from_s3(
        s3_file_name="cleaned_epc_data/cleaned.bson",
        bucket_name="retrofit-data-dev"
    )
    cleaned = msgpack.unpackb(cleaned, raw=False)

    # TODO: We should break out the identification of work types to flag blocks of flats specifically
    asset_list.identify_worktypes(cleaned)

    pprint(asset_list.work_type_figures)

    asset_list.flat_analysis()

    # Store as an excel, next to the input workbook
    output_path = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx"
    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
    with pd.ExcelWriter(output_path) as writer:
        asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
        asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
"Ground Source Heat Pump": "ground source heat pump", + "Electric Oil filled radiators": "electric radiators", + "Solid Fuel": "other", + "LPG Boiler": "boiler - other fuel", + "Electric Boiler": "electric boiler", + "No data": "unknown", + "Boiler Communal/Commercial - GAS": "communal gas boiler", + "Eco Electric Radiators": "electric radiators", + "Gas fire": "other", + "Backboiler - Solid fuel": "other", + 'combi - gas': 'gas combi boiler', + 'e7 storage heaters': 'electric storage heaters', + 'district heating system': 'district heating', + 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', + 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', + 'biomass boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', + 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', + 'lpg boiler': 'boiler - other fuel', + 'electric boiler': 'electric boiler', + 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', + 'eco electric radiators': 'electric radiators', + 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'ASHP': 'air source heat pump', + 'COMMHEAT': 'communal gas boiler', + 'GBB': 'gas combi boiler', + 'GFS': 'gas condensing boiler', + 'GWA': 'gas condensing boiler', + 'GWM': 'gas condensing combi', + 'HDU': 'district heating', + 'OILBLR': 'oil boiler', + 'SOLIDFUEL': 'boiler - other fuel', + 'STORHTR': 'electric storage heaters', + np.nan: 'unknown', +} diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py new file mode 100644 index 00000000..2612f058 --- /dev/null +++ b/asset_list/mappings/property_type.py @@ -0,0 +1,25 @@ +# These are the standard categories for property types +STANDARD_PROPERTY_TYPES = { + "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", + "unknown", "other", "block of flats" +} + +# 
This is a basic mapping that we use to map values that we've seen commonly to standard values +PROPERTY_MAPPING = { + "HOUSE": "house", + "FLAT": "flat", + "MAISONET": "maisonette", + "BUNGALOW": "bungalow", + "BLKHOUS": "block house", + "blkhous": "block house", + "BEDSIT": "bedsit", + "COACHSE": "coach house", + "coachse": "coach house", + 'Admin Unit Type': 'unknown', + 'Block': 'block of flats', + 'Bungalow': 'bungalow', + 'Flat': 'flat', + 'House': 'house', + 'Maisonette': 'maisonette', + 'Stairwell': 'other' +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py new file mode 100644 index 00000000..78d64988 --- /dev/null +++ b/asset_list/mappings/walls.py @@ -0,0 +1,92 @@ +STANDARD_WALL_CONSTRUCTIONS = { + "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", + "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", + "timber frame", + "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone", + "cob", + "new build - average thermal transmittance", +} + +WALL_CONSTRUCTION_MAPPINGS = { + "New Build - Average Thermal Transmittance": "new build - average thermal transmittance", + 'Average thermal transmittance 0.25 W/m?K': 'unknown', + 'Cavity wall, as built, insulated (assumed)': 'filled cavity', + 'Average thermal transmittance 0.31 W/m?K': 'unknown', + 'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m?K': 'unknown', + 'Average thermal transmittance 0.27 W/m²K': 'unknown', + 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal 
transmittance 0.23 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m?K': 'unknown', + 'Granite or whin, with internal insulation': 'granite or whinstone', + "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone", + 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', + 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', + 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', + 'Cavity wall,': "cavity unknown insulation", + 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', + 'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown', + 'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown', + 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', + 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', + 'Cavity wall, with internal insulation': 'filled cavity', + 'Average thermal transmittance 0.17 W/m-¦K': 
'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown', + 'new build - average thermal transmittance': 'new build - average thermal transmittance', + 'average thermal transmittance 0.25 w/m?k': 'unknown', + 'cavity wall, as built, insulated (assumed)': 'filled cavity', + 'average thermal transmittance 0.31 w/m?k': 'unknown', + 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown', + 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown', + 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown', + 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m?k': 'unknown', + 'granite or whin, with internal insulation': 'granite or whinstone', + 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', + 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', + 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation", + 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', + 'average thermal transmittance 0.19 w/m-¦k': 
'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', + 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown', + 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown', + 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown', + 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', + 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', + 'average thermal transmittance 0.28 w/m?k': 'unknown', + 'Cavity wall, filled cavity': 'filled cavity', + 'Cavity wall, filled cavity and external insulation': 'filled cavity', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone', + 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', + 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', + 'Solid brick, with external insulation': 'insulated solid brick', + 'Solid brick, with internal insulation': 'insulated solid brick', + 'System built, as built, insulated (assumed)': 'system built', + 'System built, as built, no insulation (assumed)': 'system built', + 'System built, with external insulation': 'system built', + 'System built, with internal insulation': 'system built', + 'Timber frame, as built, insulated (assumed)': 'timber frame', + 'Timber frame, as built, no insulation (assumed)': 'timber frame', + 'Timber frame, as built, partial insulation (assumed)': 'timber frame', + 'Timber frame, with additional insulation': 'timber frame', + 'CAVITY': 'cavity unknown insulation', + 'COMB': 'unknown', 
+ 'NONE': 'unknown', + 'NOTKNOWN': 'unknown', + 'SOLID': 'solid brick unknown insulation', +} diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt new file mode 100644 index 00000000..fd43ac64 --- /dev/null +++ b/asset_list/requirements.txt @@ -0,0 +1,12 @@ +postal +pandas +usaddress +pydantic-settings==2.6.0 +epc-api-python==1.0.2 +fuzzywuzzy +boto3 +openpyxl +openai +tiktoken +msgpack +beautifulsoup4 \ No newline at end of file diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py new file mode 100644 index 00000000..b6d9a391 --- /dev/null +++ b/asset_list/tests/test_standardisation.py @@ -0,0 +1,5 @@ +from asset_list.AssetList import AssetList + + +def test_multi_unit_address_flagging(): + assert AssetList._identify_multi_address('Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL') diff --git a/backend/Funding.py b/backend/Funding.py index f0780c51..2839c7ff 100644 --- a/backend/Funding.py +++ b/backend/Funding.py @@ -149,7 +149,8 @@ class Funding: :return: """ measure_table = pd.DataFrame([ - m for m in self.recommendations if m in measures and m["default"] + m for m in self.recommendations if + (m["type"] in measures) or (m["measure_type"] in measures) and m["default"] ]) measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap @@ -180,13 +181,10 @@ class Funding: measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"] measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"] measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False]) - # Recommend the measure, with estimated funding amount - recommended_measure = measure_table.head(1) - return { - "measure_type": recommended_measure["measure_type"], - "estimated_funding": recommended_measure["estimated_funding"] - } + return measure_table[ + ["type", "measure_type", 
"Cost Savings", "estimated_funding"] + ].rename(columns={"Cost Savings": "project_score"}).to_dict("records") def sap_to_eco_band(self, sap_points): """ diff --git a/backend/Property.py b/backend/Property.py index a495431f..eaffd54d 100644 --- a/backend/Property.py +++ b/backend/Property.py @@ -395,6 +395,7 @@ class Property: primary_recommendation_id=rec["recommendation_id"], non_invasive_recommendations=self.non_invasive_recommendations, ) + self.recommendations_scoring_data.append(scoring_dict) simulation_epc = self.epc_record.prepared_epc.copy() @@ -1258,6 +1259,12 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True + # If the property is in a conservation area, is listed or is a heriage building, solar panels + # become a difficult measure to generally get through planning restrictions and so we do not recommend + # solar panels + if self.restricted_measures: + return False + is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"] is_valid_roof_type = ( self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"] diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index c74a0b1f..0d921bec 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -208,9 +208,14 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' - match = re.search(pattern, address) - if match: - return next(g for g in match.groups() if g is not None) + match1 = re.search(pattern, address) + if match1: + return next(g for g in match1.groups() if g is not None) + + pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + match2 = re.search(pattern2, address) + if match2: + return match2.group(2) parsed = usaddress.parse(address) # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected @@ -221,7 +226,8 @@ class SearchEpc: continue if part == 
postcode.split(" ")[1]: continue - return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip( + ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found @@ -331,6 +337,9 @@ class SearchEpc: if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) ] + if data["rows"]: + api_response["msg"] = self.SUCCESS + return api_response["msg"] def filter_rows(self, rows, property_type=None, address=None): diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 841ec2c1..8d0c05be 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -54,4 +54,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = { "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, + "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85}, } diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 04a2ef7f..d82e774b 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -338,7 +338,7 @@ def extract_property_request_data( # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN # we need to check existence of uprn - has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True + has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False if has_uprn: has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None] @@ -370,7 +370,7 @@ def extract_property_request_data( property_non_invasive_recommendations["recommendations"] = str(transformed) # Check if the valuation data has uprn - valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True + valuation_has_uprn = "uprn" in valuation_data[0] 
if valuation_data else False if valuation_has_uprn: valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None] @@ -639,8 +639,10 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) recommendations_scoring_data = recommendations_scoring_data.drop( - columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", - "carbon_ending"] + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + "carbon_ending" + ] ) all_predictions = await model_api.async_paginated_predictions( @@ -692,7 +694,8 @@ async def trigger_plan(body: PlanTriggerRequest): Recommendations.calculate_recommendation_tenant_savings( property_instance=property_instance, kwh_simulation_predictions=kwh_simulation_predictions, - property_recommendations=property_recommendations + property_recommendations=property_recommendations, + ashp_cop=body.ashp_cop ) ) property_instance.current_energy_bill = property_current_energy_bill @@ -822,7 +825,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_recommendations=recommendations[p.id], project_scores_matrix=eco_project_scores_matrix, whlg_eligible_postcodes=whlg_eligible_postcodes, - gbis_abs_rate=20, + gbis_abs_rate=15, eco4_abs_rate=15, ) funding_calulator.check_eligibiltiy() diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index f84912fe..618bec90 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -80,3 +80,5 @@ class PlanTriggerRequest(BaseModel): multi_plan: Optional[bool] = False optimise: Optional[bool] = True default_u_values: Optional[bool] = True + + ashp_cop: Optional[float] = 2.8 diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 720005d3..6d4852b2 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -1,5 +1,4 @@ import numpy as np -from scipy.constants import value 
class PropertyValuation: @@ -216,6 +215,30 @@ class PropertyValuation: cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) ) + current_epc = property_instance.data["current-energy-rating"] + + if not current_value: + return { + "current_value": 0, + "lower_bound_increased_value": 0, + "upper_bound_increased_value": 0, + "average_increased_value": 0, + "average_increase": 0 + } + + return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) + + @classmethod + def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param current_value: + :param current_epc: + :param target_epc: + :param total_cost: + :return: + """ + if not current_value: return { "current_value": 0, @@ -225,7 +248,6 @@ class PropertyValuation: "average_increase": 0 } - current_epc = property_instance.data["current-energy-rating"] # We get the spectrum of ratings between the current and target EPC epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1] diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py index 3b2e2a5b..562585ad 100644 --- a/backend/tests/test_search_epc.py +++ b/backend/tests/test_search_epc.py @@ -48,3 +48,12 @@ class TestSearchEpcIntegration: assert epc_searcher.newest_epc["lmk-key"] == lmk_key assert epc_searcher.newest_epc["uprn"] == uprn assert len(epc_searcher.older_epcs) == n_old_epcs + + def test_search_housenumber(self): + eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter' + res1 = SearchEpc.get_house_number(eg1, None) + assert res1 == "A11" + + eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL' + res2 = SearchEpc.get_house_number(eg2, None) + assert res2 == "A9" diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py index 72dfc2c0..a5cb3511 100644 --- 
a/etl/customers/l_and_g/ic_slides.py +++ b/etl/customers/l_and_g/ic_slides.py @@ -132,7 +132,7 @@ def get_data(portfolio_id, scenario_ids): return properties_data, plans_data, recommendations_data -properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199]) +properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205]) properties_df = pd.DataFrame(properties_data) plans_df = pd.DataFrame(plans_data) @@ -240,4 +240,7 @@ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) +df["Recommendation: Air Source Heat Pump"].sum() +df["Cost: Air Source Heat Pump"].sum() + df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False) diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py new file mode 100644 index 00000000..1de91b50 --- /dev/null +++ b/etl/customers/lambeth/re-knocks.py @@ -0,0 +1,23 @@ +import pandas as pd + +data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route", + header=1 +) + +data["Outcomes"].value_counts() + +# Strip out: No + +df = data[data["Outcomes"] == "See notes"] +notes_df = df[ + ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)")].value_counts().to_frame() + +example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)"] == ('Access to rear of property only through number 10. 
Overgrown athe rear of property ' + 'installer wont be able to access') + ] + +# 18 did not attend +# diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py new file mode 100644 index 00000000..ec57d9a4 --- /dev/null +++ b/etl/customers/panacap/assets.py @@ -0,0 +1,61 @@ +import os + +import pandas as pd +from dotenv import load_dotenv + +from etl.spatial.OpenUprnClient import OpenUprnClient +from etl.route_march_data_pull.app import get_data + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +addresses = [ + {"address": "3 Willis Road", "postcode": "CB1 2AQ"}, + {"address": "22 Catharine Street", "postcode": "CB1 3AW"}, + {"address": "332 Mill Road", "postcode": "CB1 3NN"}, + {"address": "330 Mill Road", "postcode": "CB1 3NN"}, + {"address": "328 Mill Road", "postcode": "CB1 3NN"}, + {"address": "71 Mill Road", "postcode": "CB1 2AS"}, + {"address": "78 Argyle Street", "postcode": "CB1 3LZ"}, + {"address": "9 Graham Road", "postcode": "CB4 2ZE"}, + {"address": "217 Mill Road", "postcode": "CB1 3BE"}, + {"address": "374 Mill Road", "postcode": "CB1 3NN"}, + {"address": "174 Thoday Street", "postcode": "CB1 3AX"}, + {"address": "37 Abbey Road", "postcode": "CB5 8HH"}, + {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"}, + {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"}, + {"address": "108 Argyle Street", "postcode": "CB1 3LS"}, + {"address": "115 Victoria Road", "postcode": "CB4 3BS"}, + {"address": "55 Ross Street", "postcode": "CB1 3BP"}, + {"address": "16 Kingston Street", "postcode": "CB1 2NU"}, + {"address": "13 Thoday Street", "postcode": "CB1 3AS"}, + {"address": "103 York Street", "postcode": "CB1 2PZ"}, +] + +asset_list = pd.DataFrame(addresses) +asset_list["row_id"] = asset_list.index + +epc_data, _, _ = get_data( + asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address", + manual_uprn_map={}, epc_api_only=True +) + 
+epc_df = pd.DataFrame(epc_data) +epc_df.shape + +asset_list = asset_list.merge( + epc_df, how="left", on="row_id" +) + +asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"}) +asset_list["uprn"] = asset_list["uprn"].astype(str) + +spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], bucket_name="retrofit-data-dev") +spatial_data["UPRN"] = spatial_data["UPRN"].astype(str) + +asset_list = asset_list.merge( + spatial_data, how="left", left_on="uprn", right_on="UPRN" +) + +asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv", + index=False) diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index 13cdc41b..fc3b7ec6 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 126 +PORTFOLIO_ID = 134 USER_ID = 8 load_dotenv(dotenv_path="backend/.env") @@ -19,22 +19,24 @@ def app(): asset_list = [ { - "address": "Garden Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308249, + "address": "Flat 2, 42 Malden Road, London NW5 3HG", + "postcode": "NW5 3HG", + "uprn": 5117165, }, { - "address": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308251 + "address": "15 Bournville Lane", + "postcode": "B30 2JY", + "uprn": 100070301128 }, { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "building_id": 1, - "uprn": 308250, + "address": "34 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301140 + }, + { + "address": "36 Bournville Lane", + "postcode": "B30 2LN", + "uprn": 100070301142 } ] asset_list = pd.DataFrame(asset_list) @@ -65,20 +67,21 @@ def app(): valuation_data = [ { - "address": "Garden Flat, 
48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 + "uprn": 5117165, + "valuation": 467_000 }, { - "addresss": "Top Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 + "uprn": 100070301128, + "valuation": 335_000 }, { - "address": "First Floor Flat, 48 Bedminster Parade", - "postcode": "BS3 4HS", - "valuation": 337_000 - } + "uprn": 100070301140, + "valuation": 276_000 + }, + { + "uprn": 100070301142, + "valuation": 276_000 + }, ] # Store valuation data to s3 valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index 8538188b..95fe4fcd 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -1,4 +1,7 @@ import os +from urllib import parse +from fuzzywuzzy import fuzz + import PyPDF2 import re import pandas as pd @@ -128,6 +131,7 @@ def extract_summary_report(pdf_path): "Current SAP Rating": None, "Current EPC Band": None, "Fuel Bill": None, + "Main Building Age Band": None, "Number of Storeys": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -177,6 +181,13 @@ def extract_summary_report(pdf_path): sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + # Extract age + age_band_match = re.search( + r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) + data["Main Building Age Band"] = age_band_match.group(1) + # Number of storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) data["Number of Storeys"] = int(storeys_match.group(1)) @@ -465,7 +476,11 @@ def extract_building_parts_summary(text): r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL ) if not dimensions_section: - raise ValueError("Failed to locate dimensions section in 
the text.") + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") dimensions_text = dimensions_section.group(1) @@ -694,6 +709,7 @@ def extract_epr(pdf_path): "Primary Energy Use (kWh/yr)": None, "Primary Energy Use Intensity (kWh/m2/yr)": None, "Number of Storeys": None, + "Main Building Age Band": None, "Fuel Bill": None, "Window Age Description": None, "Window Age Description Proportion (%)": None, @@ -747,12 +763,38 @@ def extract_epr(pdf_path): # Extract Current and Potential SAP ratings sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) - current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) - data["Current SAP Rating"] = current_sap + if sap_match is None: + # Handles the older format of the elmhurst EPR + # The text will look something like this: + # Least energy efficient - higher running costsD 61 - we extract D 61 + sap_match = re.search( + r"(?P[A-G])\s(?P\d{1,3})(?P[A-G])\s(?P\d{1,3})", + text) + data["Current EPC Band"] = sap_match.group("current_epc") + data["Current SAP Rating"] = int(sap_match.group("current_sap")) + else: + current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2)) + data["Current SAP Rating"] = current_sap # Extract the primary energy use intensity additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) - data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + if additional_rating_match: + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1)) + else: + # Handles the older format of the Elmhurst EPR + primary_energy_match = re.search(r"actual consumption\.\n(?P\d+)", text) + data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy")) + # We calculate the primary energy use intensity by 
dividing by floor area + floor_area = re.search(r"Total Floor Area\s(?P\d+)\s?m2", text).group("floor_area") + data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area) + + # Extract age band + age_band_match = re.search( + r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)", + text + ) + + data["Main Building Age Band"] = age_band_match.group(1) # Extract Number of Storeys storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) @@ -880,11 +922,18 @@ def detect_report_type(pdf_path, pdf_file): """ # Attempt to read the first page of the PDF to determine type with open(pdf_path, "rb") as file: + # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter + # This is because the pdf is irregular. We could possibly try a library like fitz to handle this reader = PyPDF2.PdfReader(file) first_page_text = reader.pages[0].extract_text() if reader.pages else "" + n_pages = len(reader.pages) - if is_energy_report(first_page_text): + if is_energy_report(first_page_text) and n_pages > 3: + # The EPR should have more than 3 pages return "epr" + elif is_energy_report(first_page_text) and n_pages <= 3: + # This is a shortened version of the EPR which isn't massively useful + return "short_form_epr" elif "summary" in pdf_file.lower() or is_summary_report(first_page_text): return "summary" elif is_condition_report(first_page_text): @@ -1675,7 +1724,6 @@ def append_stonewater_id(): ) model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])] model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int) - z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values() original_archetypes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " @@ -2906,6 +2954,14 @@ def identify_incorrect_packages(): ) 
+def extract_sharepoint_url(x): + if pd.isnull(x): + return "" + return "/".join(parse.urlparse( + x.split(" - http")[1] + ).path.replace("%20", " ").split("/")[-2:]) + + def revised_model(): """ This function implements the revised model for Stonewater, where we are looking at new priority postcodes @@ -2913,7 +2969,6 @@ def revised_model(): """ # 1) Create the new list of properties - new_priority_postcodes = pd.read_excel( "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 " "priority list.xlsx" @@ -2927,16 +2982,1312 @@ def revised_model(): original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])] original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"] original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int) + original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str) - original_archetypes = original_archetypes[ - ["Address ID", "Archetype ID", ""] - ] + wave_21_folder_name = "Wave 2.1 Surveys - 2" # Check if we have all of the addresses missed = original_archetypes[ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values) ]["Archetype ID"].unique() - assert + + assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'} + + original_archetypes = original_archetypes[ + ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"] + ] + + # Merge these archetypes on to the new priority postcodes + new_priority_postcodes = new_priority_postcodes.merge( + original_archetypes, how="left", on="Address ID" + ) + + # Basic check, should have no rows with missing Archetype ID, where + assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin( + original_archetypes["Address ID"] + ).sum()) == 0 + + # We pull together the survey data sheet + survey_folders = [] + + # Loop over each survey folder and list 
its contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + wave_21_folders = [ + "1. Herefordshire", + "2. Bedfordshire", + "3. Wiltshire", + "4. Bournemouth", + "5. Coventry", + "6. West Sussex", + "7. Dorset", + "8. Cambridgeshire", + "9. Guildford", + "10. Little Island", + "11. CCS Dorset" + ] + + for wave_2_1_folder in wave_21_folders: + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in + os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + # We now do a large pull of all of the data + extracted_data = [] + mtp_extracted_data = [] # Additional data to extract from the medium term plans + for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + + # Check that the survey folder is actually a folder + if not os.path.isdir(survey_folder_path): + continue + + # List the folders inside of the survey folder + survey_subfolders = [ + name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name)) + ] + + # Check if there's a "retrofit assessment" folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + + mtp_folder = next( + (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), + None + 
) + if mtp_folder: + # We have a mid term plan: + mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) + # Get the contents - files and not folder + mtp_contents = [ + os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) + if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) + ] + + has_v1 = [ + f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() + ] + + if has_v1: + # Then we go one level deeper + mtp_contents = [ + os.path.join(has_v1[0], f) for f in + os.listdir(os.path.join(survey_folder_path, has_v1[0])) + ] + + # We check the the IMA + for file_name in mtp_contents: + + filepath = os.path.join(survey_folder_path, file_name) + # We expect a pdf so try and parse it + try: + with open(filepath, "rb") as file: + reader = PyPDF2.PdfReader(file) + # Just the first page + text = reader.pages[0].extract_text() + + except Exception as e: + continue + + # We check if this is an IMA + ima_heading_search = re.search( + r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text + ) + + is_ima = bool(ima_heading_search) + if not is_ima: + continue + + # Otherwise, extract: RIR, PV + pv_search = re.search(r"PV \(\d+Kwp\)", text) + has_pv = bool(pv_search) + pv_system = pv_search.group(0) if has_pv else None + + # We perform a second search for PV: + if pv_search is None: + pv_search = re.search("solar pv", text.lower()) + has_pv = bool(pv_search) + pv_system = "Solar PV" if has_pv else None + + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) + has_rir = bool(rir_search) + rir_spec = rir_search.group(0) if has_rir else None + + mtp_extracted_data.append({ + "survey_folder": survey_folder, + "has_pv": has_pv, + "PV System": pv_system, + "RIR Specification": rir_spec, + "has_rir": has_rir + }) + continue + + # If retrofit assessment folder exists, check if it has content + if retrofit_folder or ra_folder: + if retrofit_folder: + 
retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + continue + else: + # Then we have an empty Retrofit Assessment folder + continue + + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + + retrofit_assessment_data = pd.DataFrame(extracted_data) + mtp_df = pd.DataFrame(mtp_extracted_data) + + # Save + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False + # ) + # mtp_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False + # ) + retrofit_assessment_data = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment 
Data Sheet 5.csv"), + ) + mtp_df = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), + ) + + # There are a few duplicates we just manually drop + mtp_df = mtp_df.drop_duplicates() + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" + ) & (~mtp_df["has_pv"])) + ] + + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5" + ) & (~mtp_df["has_pv"])) + ] + + # Remove some definite duplicates + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + + retrofit_assessment_data = retrofit_assessment_data[ + ~retrofit_assessment_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop + ) + ] + + retrofit_assessments_data_columns = [ + 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', + 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys', + 'Fuel Bill', 'Window Age Description', + 'Window Age Description Proportion (%)', + 'Secondary Window Age Description', + 'Secondary Window Age Description Proportion (%)', 'Number of Windows', + 'Total Number of Doors', 'Number of Insulated Doors', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference', + 'Existing Primary Heating Controls', + 'Existing Primary Heating % of Heat', + 'Existing Secondary Heating System', + 
'Existing Secondary Heating PCDF Reference', + 'Existing Secondary Heating Controls', + 'Existing Secondary Heating % of Heat', 'Secondary Heating Code', + 'Water Heating Code', 'Total Floor Area (m2)', + 'Total Ground Floor Area (m2)', 'RIR Floor Area', + 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)', + 'Number of Light Fittings', 'Number of LEL Fittings', + 'Number of fittings needing LEL', 'Main Roof Type', + 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', + 'Main Wall Thickness', 'Main Building Alternative Wall Type', + 'Main Building Alternative Wall Insulation', + 'Main Building Alternative Wall Dry-lining', + 'Main Building Alternative Wall Thickness', + 'Main Fuel', + 'Main Building Age Band', + ] + # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: + retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] + rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed)) + retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict) + retrofit_assessment_data["Survey: Current EPC Band"] = ( + retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x)) + ) + + # We can read in the data as needed + + # Next Step: Read in the coordinated measures and match to the extracted data + ############################################################ + # CCS + ############################################################# + ccs_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Jan 2025 Project", + "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx" + ), + header=4 + ) + ccs_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx" + ), + header=4 + ) + ccs_coordination_sheet = ccs_postcodes[['Name', 
'Post Code', 'Asset ID', 'Asset ID.1']].merge( + ccs_coordination_sheet, how="left", on="Name" + ) + ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])] + ccs_coordination_sheet["contractor"] = "CCS" + # We split ccs into two sections - the first being + ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) + ccs_coordination_sheet = ccs_coordination_sheet.head(87) + ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + + ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) + + ############################################################ + # WATES + ############################################################# + wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx" + ), + header=4 + ) + wates_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" + ), + header=4 + ) + wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])] + wates_coordination_sheet = wates_coordination_sheet.merge( + wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name" + ) + + wates_coordination_sheet["contractor"] = "Wates" + # Break into the different sites: + # Wiltshire + wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267) + wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :] + wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :] + wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :] + wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] + wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] + 
wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :] + + wates_coordination = pd.concat( + [ + wates_coordination_sheet_wiltshere, + wates_coordination_sheet_herefordshire, + wates_coordination_sheet_coventry, + wates_coordination_sheet_bedfordshire, + wates_coordination_sheet_bournemouth, + wates_coordination_sheet_cambridgeshire, + wates_coordination_sheet_removed_from_programme, + wates_coordination_sheet_abeyance + ] + ) + # We correct the Asset ID for 34 Kempster Close + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == "34 Kempster Close", + "12005", + wates_coordination["Asset ID"] + ) + + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID"] + ) + + wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] + + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( + lambda x: extract_sharepoint_url(x) + ) + + ############################################################ + # NEW 450 COORDINATED RETROFIT ASSESSMENTS + ############################################################# + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + features["Address ID"] = features["Address ID"].astype(str).astype(int) + 
features_to_merge = features[["Address ID", "Organisation Reference"]] + + retrofit_packages_board = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" + ), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + retrofit_packages_board = retrofit_packages_board.merge( + features_to_merge, how="left", on="Address ID" + ) + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' + } + + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is 
contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + + if filtered.empty: + continue + if filtered.shape[0] != 1: + raise Exception("something went wrong") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + matching_lookup = pd.DataFrame(matching_lookup) + + ccs_coordination = ccs_coordination.rename( + columns={"Post Code": "Postcode"} + ) + ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + 
ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] + + ccs_manual_filters = { + "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" + } + ccs_matching_lookup = [] + for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in ccs_manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). 
+ str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 + ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + ccs_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID.1": home["Asset ID.1"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") + + ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup) + # We get a match for all records + assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0] + assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum() + assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum() + + # We do the same for Wates + wates_coordination = wates_coordination.rename( + columns={"Post Code": "Postcode"} + ) + wates_coordination = 
wates_coordination[ + wates_coordination["Retrofit Assessment"].isin(["Completed"]) + ] + wates_coordination = wates_coordination[ + ~pd.isnull(wates_coordination["Postcode"]) + ] + + wates_manual_filters = { + "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', + '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2', + } + wates_matching_lookup = [] + # Examples to skip when we cannot get the data + wates_to_skip = [ + "66 Abbatt Close", # File type is unusual, couldn't extract the data + "Flat 69 Goddard Road", # Doesn't exist + "19 Garth House", # # File type is unusual, couldn't extract the data + '5 Gilpin Close', # No properly formatted EPR + '49 The Hide, Netherfield', # TODO: TEMP HERE + '19 Chanders Rd', + '5 Chanders Rd', + '23 Chanders Rd', + '3 Chanders Rd', + '1 Orchard Close', + ] + wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)] + + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + + # Search the folder + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False) + ] + if len(filtered) == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + if home["Name"] in wates_to_skip: + continue + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in wates_manual_filters: + filtered = 
retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + + if to_filter.sum() > 1: + to_filter = ( + filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() == + home["Name"].replace(r"[^\w\s]", "").lstrip().lower() + ) + + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). + str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() == + home["Name"].lower() + ) + if to_filter.sum() == 0: + # Do a fuzzy match on the name + # Find the best filter + to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply( + lambda x: fuzz.partial_ratio(home["Name"], x) > 93 + ) + if to_filter.sum() == 0: + # We also some cases where the name of the survey folder is like "Colville Road 7" and the + # property name is actually 7 Colville Road, so we try taking the final part of the address, + # splitting on space, and adding it to the front + def reformat_survey_folder(x): + filename = x.split("/")[-1] + parts = filename.split(" ") + return " ".join(parts[-1:] + parts[:-1]) + + to_filter = ( + 
filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") + wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + + # We get a match for all records + assert wates_matching_lookup.shape[0] == wates_coordination.shape[0] + assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum() + assert not wates_matching_lookup["Asset ID"].duplicated().sum() + + # Merge lookup tables onto the coordination sheets + wates_coordination = wates_coordination.merge( + wates_matching_lookup, how="left", on="Name" + ) + missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] + if not missed_asset_id.empty: + raise Exception("Missing Asset ID") + + if wates_coordination["Asset ID_x"].duplicated().sum(): + raise Exception("Duplicated IDs in wates") + + # We merge the mpt data on to the wates coordination + wates_coordination = wates_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + + ccs_coordination = ccs_coordination.merge( + ccs_matching_lookup, how="left", on="Name" + ) + ccs_coordination = ccs_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + + retrofit_packages_board = retrofit_packages_board.merge( + matching_lookup, how="left", on="Name" + ) + + # We now map the retrofit assessment data to the coordinated packages + wates_coordination = wates_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + retrofit_packages_board = 
retrofit_packages_board.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board + to_remove = wates_coordination[ + wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + assert to_remove.shape[0] == 4 + # Remove them from the wates board + wates_coordination = wates_coordination[ + ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + + # We combine this into a singular board + coordinated_packages = pd.concat( + [ + retrofit_packages_board[ + [ + "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating', + 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Organisation Reference', + ] + retrofit_assessments_data_columns_prefixed + ], + ccs_coordination[ + [ + # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, + # Solar PV + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'PV System', + "Asset ID.1_y", + ] + retrofit_assessments_data_columns_prefixed + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID.1_y': 'Organisation Reference', + "PV System": "Solar PV", + } + ), + wates_coordination[ + [ + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" + ] + retrofit_assessments_data_columns_prefixed + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. 
Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID_x': 'Organisation Reference', + "PV System": "Solar PV", + } + ) + ] + ) + + coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int) + assert not coordinated_packages["Organisation Reference"].duplicated().sum() + + # Merge the property features on + coordinated_packages = coordinated_packages.merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]], + how="left", + on="Organisation Reference" + ) + + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])] + coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])] + + # We need the features pertaining to these priority postcodes + + def find_nearest_matching_property(coordinated_packages, home): + filter_levels = [ + (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2), + (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3), + (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4), + (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5), + (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6), + (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7), + ] + + max_confidence = max([confidence for (_, confidence) in filter_levels]) + + for i, (filters, match_confidence) in enumerate(filter_levels): + match = coordinated_packages.copy() + + for col in filters: + match = match[match[col] == home[col]] + + if not match.empty: + return match, match_confidence + + # Finally, we search for a property in the same Archetype + match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]] + if not match.empty: + return 
match, max_confidence + 1 + + return None, None # No match found + + coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip() + new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip() + + coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip() + new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip() + + coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0] + new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0] + + coordinated_packages = coordinated_packages.merge( + new_priority_postcodes[["Organisation Reference", "Archetype ID"]], + how="left", + on="Organisation Reference" + ) + + # For every property in the priority postcodes data, we look for a most appropriate matching property + no_match = [] + matches = [] + for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)): + # We check if the property was surveyed + survey_result = coordinated_packages[ + coordinated_packages["Organisation Reference"] == home["Organisation Reference"] + ] + if not survey_result.empty: + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m, + "match_confidence": 1, + "Was Surveyed": True + } for m in survey_result["Organisation Reference"].values + ] + matches.extend(to_extend) + continue + + closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home) + if closest_match is None: + no_match.append(home["Organisation Reference"]) + continue + + to_extend = [ + { + "Organisation Reference": home["Organisation Reference"], + "Best Match Organisation Reference": m, + "match_confidence": match_confidence, + "Was Surveyed": False + } for m in 
closest_match["Organisation Reference"].values + ] + matches.extend(to_extend) + + no_match_summary = new_priority_postcodes[ + new_priority_postcodes["Organisation Reference"].isin( + no_match + ) + ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[ + "Organisation Reference"].count().reset_index() + + no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) + + # len(no_match) + # 8764, 5607, 5646, 5071 + # no_match_summary.shape + # (3953, 6), (2948, 6), (2969, 7), (2575, 7) + + matches_df = pd.DataFrame(matches) + + matches_df = matches_df.merge( + coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], + left_on="Best Match Organisation Reference", right_on="Organisation Reference", + suffixes=("", " - Closest Match") + ) + + measures_columns = [ + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures' + ] + + # We want to aggregate the matches, when we have multiple + aggregated_matches_df = [] + for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + + measures = coordinated_packages[ + ( + coordinated_packages["Organisation Reference"].isin( + mapped_matches['Best Match Organisation Reference'].values + ) + ) + ][measures_columns] + + if mapped_matches.shape[0] == 1: + # Get the measures for this property + measures = measures.squeeze() + + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": 1, + "Proportion": 100, + "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], + "Was Surveyed": mapped_matches["Was Surveyed"].values[0], + **measures + } + ) + continue + + # We need to aggregate the 
matches, since we have multiple + average_rating = mapped_matches["Survey: Current SAP Rating"].mean() + number_of_matches = mapped_matches.shape[0] + average_epc_rating = sap_to_epc(average_rating) + # proportion is the number of properties that have this EPC rating + proportion_with_this_epc = int( + mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[ + 0] / number_of_matches * 100 + ) + + measures_aggregated = {} + for m in measures_columns: + if any(~pd.isnull(measures[m])): + # Check if we have 2 unique values + vals = measures[~pd.isnull(measures[m])][m].unique() + if len(vals) > 1: + measures_aggregated[m] = ", ".join(vals) + else: + measures_aggregated[m] = vals[0] + + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": number_of_matches, + "Proportion": proportion_with_this_epc, + "Estimated SAP Rating": average_rating, + "Estimated EPC Rating": average_epc_rating, + "Was Surveyed": False, + **measures_aggregated + } + ) + + aggregated_matches_df = pd.DataFrame(aggregated_matches_df) + + mapped_priority_list = new_priority_postcodes.merge( + aggregated_matches_df, on="Organisation Reference", how="left" + ) + + mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0] + + # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0 + + def remove_leading_zero(address): + return re.sub(r"^0([1-9]) ", r"\1 ", address) + + mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37004, + "8 Mason Road", + mapped_priority_list["address1"] + ) + mapped_priority_list["address1"] = np.where( + mapped_priority_list["Organisation Reference"] == 37003, + "9 Mason Road", + mapped_priority_list["address1"] + ) + + mapped_priority_list = mapped_priority_list.rename( + columns={"UPRN": "uprn"} + 
) + mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + + # Flag where 2 out of the three columns have consensus + mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( + (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | + (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | + (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) + ) + + # Let's get the newest EPC data for these properties + # We merge on UPRN, when we have it + # from etl.route_march_data_pull.app import get_data + # epc_data, errors, nodata = get_data( + # asset_list=mapped_priority_list, + # fulladdress_column="Address", + # address1_column="address1", + # postcode_column="Postcode", + # manual_uprn_map={}, + # epc_api_only=True + # ) + # + # epc_df = pd.DataFrame(epc_data) + # epc_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False + # ) + epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv")) + epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"}) + + # We now package up the data + + # Sheet 1 is the base coordination data + output_coordination_sheet = coordinated_packages[ + [ + "Name", "Postcode", 'Organisation Reference', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', + 'Survey: Current SAP Rating', + 'Survey: Current EPC Band', + 'Survey: Primary Energy Use (kWh/yr)', + 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', + 'Survey: Number of Storeys', 'Survey: Fuel Bill', + 'Survey: Window Age Description', + 'Survey: Window Age Description Proportion (%)', + 'Survey: Secondary Window Age Description', + 'Survey: Secondary Window Age Description 
Proportion (%)', + 'Survey: Number of Windows', 'Survey: Total Number of Doors', + 'Survey: Number of Insulated Doors', + 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating PCDF Reference', + 'Survey: Existing Primary Heating Controls', + 'Survey: Existing Primary Heating % of Heat', + 'Survey: Existing Secondary Heating System', + 'Survey: Existing Secondary Heating PCDF Reference', + 'Survey: Existing Secondary Heating Controls', + 'Survey: Existing Secondary Heating % of Heat', + 'Survey: Secondary Heating Code', 'Survey: Water Heating Code', + 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)', + 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)', + 'Survey: First Extension Wall Area (m2)', + 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings', + 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type', + 'Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type', + 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining', + 'Survey: Main Wall Thickness', + 'Survey: Main Building Alternative Wall Type', + 'Survey: Main Building Alternative Wall Insulation', + 'Survey: Main Building Alternative Wall Dry-lining', + 'Survey: Main Building Alternative Wall Thickness', + 'Survey: Main Fuel', + 'Survey: Main Building Age Band', + 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' + ] + ].rename( + columns={ + 'Walls': "Parity - Walls", + 'Roofs': "Parity - Roof", + 'Heating': "Parity - Heating", + 'Main Fuel': "Parity - Fuel", + 'Age': "Parity - Age Band", + 'Property Type': "Parity - Property Type" + } + ) + + # Sheet 2 is the lookup table which maps the properties to their closest match + # We need to bring in the parity attributes between the mapped properties so we can see side-by-side + mapped_lookup = matches_df[ + [ + 'Organisation Reference', + 'Best Match Organisation Reference', + 'Survey: Current EPC Band', 
+ 'Survey: Current SAP Rating', + "Was Surveyed", + "match_confidence", + ] + ].rename( + columns={ + 'Best Match Organisation Reference': "Best Match - Organisation Reference", + "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band", + 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" + } + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]], + how="left", + on="Organisation Reference" + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + "Walls": "Best Match - Walls", + "Roofs": "Best Match - Roof", + "Heating": "Best Match - Heating", + "Main Fuel": "Best Match - Main Fuel", + "Age": "Best Match - Age", + "Property Type": "Best Match - Property Type", + "Total Floor Area": "Best Match - Total Floor Area" + } + ), + how="left", + on="Best Match - Organisation Reference" + ).merge( + coordinated_packages[ + [ + "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', + 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', + 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)', + 'Survey: Main Building Age Band', + ] + ].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type', + 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation', + 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type', + 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness', + 'Survey: Existing 
Primary Heating System': 'Best Match - Survey: Existing Primary Heating System', + } + ), + how="left", + on="Best Match - Organisation Reference" + ) + + # Finally, we have the property, against the mapped home with the estimate SAP scores and the EPC data + worksheet = mapped_priority_list[ + [ + 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', + 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', + 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed", + 'Main Wall Insulation', + 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', + 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', + 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', + 'Other measures', "2 of 3 Data Sources Have Consensus on EPC" + ] + ].rename( + columns={ + "SAP": "Parity - SAP Rating", + "SAP Band": "Parity - EPC Rating", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Walls", + "Roofs": "Parity - Roofs", + 'Glazing': "Parity - Glazing", + 'Heating': 'Parity - Heating', + 'Main Fuel': 'Parity - Main Fuel', + 'Hot Water': 'Parity - Hot Water', + 'Proportion': 'Proportion of matched properties with same EPC rating', + } + ).merge( + epc_df[ + [ + "Organisation Reference", + "uprn", + "current-energy-efficiency", + "current-energy-rating", + "lodgement-date", + "construction-age-band", + "walls-description", + "roof-description", + "mainheat-description", + "windows-description", + "hotwater-description", + "main-fuel", + "total-floor-area", + ] + ].rename( + columns={ + "uprn": "Last EPC - uprn", + "current-energy-efficiency": "Last EPC - SAP Score", + "current-energy-rating": "Last EPC - EPC Rating", + "lodgement-date": "Last EPC - Date Lodged", + "construction-age-band": "Last EPC - Age Band", + "walls-description": "Last EPC - Walls", + "roof-description": "Last EPC - Roof", + "mainheat-description": 
"Last EPC - Heating", + "windows-description": "Last EPC - Windows", + "hotwater-description": "Last EPC - Hot Water", + "main-fuel": "Last EPC - Main Fuel", + "total-floor-area": "Last EPC - Total Floor Area" + } + ), + how="left", + on='Organisation Reference' + ) + + worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime( + worksheet["Last EPC - Date Lodged"]).dt.year + + worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str) + + worksheet["uprn"] = np.where( + pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]), + worksheet["Last EPC - uprn"], + worksheet["uprn"] + ) + + worksheet["uprn"] = worksheet["uprn"].replace("", "") + + worksheet = worksheet.drop(columns=["Last EPC - uprn"]) + + # Save to Excel with multiple sheets + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx") + with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: + worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) + mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) + output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True) # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py index 8751960c..a5da0c79 100644 --- a/etl/customers/stonewater/data_cleaning.py +++ b/etl/customers/stonewater/data_cleaning.py @@ -1,6 +1,7 @@ import os import shutil from tqdm import tqdm +from etl.access_reporting.app import SharePointClient def delete_large_files(): @@ -66,13 +67,17 @@ def delete_large_files(): def download_data_from_sharepoint(): # Given a sharepoint location, this function will download the retrofit assessment folders from the locations # specified in the sharepoint location - from etl.access_reporting.app import SharePointClient + + SHAREPOINT_CLIENT_ID = 
os.getenv("SHAREPOINT_CLIENT_ID", None) + SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None) + SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None) + OSMOSIS_SHAREPOINT_SITE_ID = os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None) sharepoint_client = SharePointClient( - tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf", - client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d", - client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ", - site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489" + tenant_id=SHAREPOINT_TENANT_ID, + client_id=SHAREPOINT_CLIENT_ID, + client_secret=SHAREPOINT_CLIENT_SECRET, + site_id=OSMOSIS_SHAREPOINT_SITE_ID ) # Retrieve the data from Sharepoint and write to local machine @@ -81,9 +86,14 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" ) - len(contents["value"]) + folders_to_keep = [ + "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth", + "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire", + "9. Guildford", "10. Little Island", "11. CCS Dorset", + ] + folders_to_pull = [ - folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. 
Coventry"] + folder for folder in contents["value"] if folder["name"] in folders_to_keep ] for folder_to_pull in folders_to_pull: # Get the contents @@ -103,35 +113,42 @@ def download_data_from_sharepoint(): folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" + folder_to_pull["name"] + "/" + property_folder["name"] ) - # We look for the retrofit assessment folder: + if not property_folder_contents.get("value"): + continue + # We look for the retrofit assessment folder or mtp folders: property_sub_folders = [ - f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower() + f for f in property_folder_contents["value"] if + "ra coordinator info" in f["name"].lower() or + "retrofit assessment" in f["name"].lower() or + "ra info" in f["name"].lower() or + "mtp" in f["name"].lower() or + "mid-term" in f["name"].lower() ] if not property_sub_folders: continue - # if we have this, we download the folder and store it on my laptop! - property_sub_folder = property_sub_folders[0] + for property_sub_folder in property_sub_folders: + # if we have this, we download the folder and store it on my laptop! 
- property_folder_path = os.path.join( - "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + property_folder_path = os.path.join( + "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - download_dir = os.path.join( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys", - folder_to_pull["name"], - property_folder["name"], - property_sub_folder["name"] - ) + download_dir = os.path.join( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2", + folder_to_pull["name"], + property_folder["name"], + property_sub_folder["name"] + ) - # We download the folder - sharepoint_client.download_sharepoint_folder( - drive_id=sharepoint_client.document_drive["id"], - folder_path=property_folder_path, - download_dir=download_dir, - excluded_file_types=["MOV"] - ) + # We download the folder + sharepoint_client.download_sharepoint_folder( + drive_id=sharepoint_client.document_drive["id"], + folder_path=property_folder_path, + download_dir=download_dir, + excluded_file_types=["MOV", "jpg"] + ) diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index bda9c30c..6666ce15 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -217,78 +217,7 @@ def app(): ) ) - # We get the EPC data - # epc_data = json.loads( - # read_from_s3( - # bucket_name="retrofit-data-dev", - # s3_file_name="customers/Stonewater/clustering/epc_data.json" - # ) - # ) - # epc_data = pd.DataFrame(epc_data) - # - # epc_data["uprn"] = np.where( - # epc_data["internal_id"] == 1091, - # 83143766, - # epc_data["uprn"] - # ) - # - # epc_data_batch_2 = read_pickle_from_s3( - # 
s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl", - # bucket_name="retrofit-data-dev" - # ) - # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2) - # - # complete_epcs = pd.concat([epc_data, epc_data_batch_2]) - # - # epcs_to_merge = complete_epcs[ - # [ - # "uprn", - # "address", - # "postcode", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "address": "Address", - # "postcode": "Postcode", - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ) - # # We de-dupe, taking the newest on the date the EPC was lod - # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"]) - # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False) - # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn") - stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str) - stonewater_cavity_properties["Reason Included"].value_counts() # 
Find the postcodes where an Osmosis survey revealed a need for CWI postcodes_found_needing_cwi = stonewater_cavity_properties[ stonewater_cavity_properties["Reason Included"].isin( @@ -339,12 +268,7 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # epcs_to_merge, - # how="left", - # left_on="UPRN", - # right_on="uprn" - # ) + ) # We now flag the additional properties in the as built list @@ -434,20 +358,20 @@ def app(): additional_properties["Suspected Needs CWI - not surveyed"] = ( ( - additional_properties["Postcode"].isin(postcodes_found_needing_cwi) + additional_properties["Postcode"].isin(postcodes_found_needing_cwi) & + ~additional_properties["Installed under ECO3"] ) ) - additional_properties["Same Postcode as Installed under ECO3"].value_counts() - # We drop Full Address additional_properties = additional_properties.drop(columns=["Full Address"]) additional_properties2 = additional_properties[[ "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing", "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3', - 'Same Postcode as Installed under ECO3' + 'Same Postcode as Installed under ECO3', "Organisation Reference", ]].rename( columns={ + "Organisation Reference": "Org. 
ref.", "SAP": "Parity - Predicted SAP", "SAP Band": "Parity - Predicted SAP Band", "Age": "Parity - Build Age", @@ -461,65 +385,62 @@ def app(): "Renewables": "Parity - Renewables", "Total Floor Area": "Parity - Total Floor Area" } - ) # .merge( - # pd.DataFrame(additional_properties_epcs)[ - # [ - # "row_id", - # "property-type", - # "built-form", - # "inspection-date", - # "current-energy-rating", - # "current-energy-efficiency", - # "roof-description", - # "walls-description", - # "transaction-type", - # "secondheat-description", - # "total-floor-area", - # "construction-age-band", - # "floor-height", - # "number-habitable-rooms", - # "mainheat-description", - # "energy-consumption-current" - # ] - # ].rename( - # columns={ - # "inspection-date": "Date of last EPC", - # "current-energy-efficiency": "SAP score on register", - # "current-energy-rating": "EPC rating on register", - # "property-type": "Property Type", - # "built-form": "Archetype", - # "total-floor-area": "Property Floor Area", - # "construction-age-band": "Property Age Band", - # "floor-height": "Property Floor Height", - # "number-habitable-rooms": "Number of Habitable Rooms", - # "walls-description": "Wall Construction", - # "roof-description": "Roof Construction", - # "mainheat-description": "Heating Type", - # "secondheat-description": "Secondary Heating", - # "transaction-type": "Reason for last EPC", - # "energy-consumption-current": "Heat Demand (kWh/m2)", - # } - # ), - # how="left", - # on="row_id" - # ) + ) + + # Combine the data: + + stonewater_cavity_properties2 = stonewater_cavity_properties.merge( + features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference" + ) + full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2]) + full_dataset = full_dataset.drop(columns=['Osm. 
ID']) + + # We not define the priority list for non-intrusives + full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2] + full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0] + + # Strip out anything we definitely don't want + full_dataset = full_dataset[~full_dataset["Installed under ECO3"]] + + areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique() + + priorities = full_dataset[ + full_dataset["Postal Region 2"].isin(areas) + ] + + region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index() + region_prevalance = region_prevalance[region_prevalance["count"] > 100] + df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)] + + df["Postal Region"].value_counts() + df["Postal Region 2"].value_counts() + + if df["Installed under ECO3"].sum(): + raise ValueError("There are properties in the priority list that were installed under ECO3") + + df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - " + "revised list.csv", + index=False + ) # We save the data locally - stonewater_cavity_properties.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " - "postcodes.csv", - index=False - ) - additional_properties2.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " - "non-priority postcodes.csv", - index=False - ) - # Save the survey findings - needs_cwi.to_csv( - "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", - index=False - ) + # stonewater_cavity_properties.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority " + # "postcodes.csv", + # index=False + # ) + # 
additional_properties2.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - " + # "non-priority postcodes.csv", + # index=False + # ) + # # Save the survey findings + # needs_cwi.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - + # WIP.csv", + # index=False + # ) def cross_reference_epc_programme(): @@ -528,6 +449,12 @@ def cross_reference_epc_programme(): "SURVEYED - ECO3 NOT COMPLETED.xlsx" ) + for _, x in eco3_fallout.iterrows(): + house_no = SearchEpc.get_house_number(x["ADDRESS"], "") + if house_no is None: + house_no = x["ADDRESS"].split(",")[0] + x["house_number"] = house_no + eco3_fallout["house_number"] = eco3_fallout.apply( lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 ) @@ -558,3 +485,58 @@ def cross_reference_epc_programme(): stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) ] match.head() + + +def finalise_list_for_non_intrusives(): + non_intrusives_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater " + "Non-Intrusives.xlsx" + ) + + # Remove anything installed under ECO3 + non_intrusives_list = non_intrusives_list[~non_intrusives_list["Installed under ECO3"]] + + # We make any properties that were surveyed by Osmosis + packages = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 " + "(1).xlsx", + header=13, + sheet_name="Modelled Packages" + ) + + non_intrusives_list["Surveyed by Osmosis"] = non_intrusives_list["Address ID"].isin( + packages["Address ID"].values + ) + # Removed 54 addresses + final_non_intrusives = non_intrusives_list[ + ~non_intrusives_list["Surveyed by Osmosis"] + ] + + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master 
sheet.csv", + encoding='latin1' + ) + + # Add on the orgnisaion reference + final_non_intrusives = final_non_intrusives.merge( + features[["Organisation Reference", "Address ID"]], + how="left", + on="Address ID" + ) + + final_non_intrusives["Postal Region"] = final_non_intrusives["Postcode"].str.split(" ").str[0].str[0:2] + selected_regions = final_non_intrusives[ + final_non_intrusives["Include in non-intrusives"] + ]["Postcode"].unique() + + final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions) + + # Filter down: + final_non_intrusives = final_non_intrusives[ + final_non_intrusives["Is in region"] + ] + + final_non_intrusives.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives " + "List - final.xlsx") diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py index bce8cd1f..1d2e1472 100644 --- a/etl/find_my_epc/AssetListEpcData.py +++ b/etl/find_my_epc/AssetListEpcData.py @@ -72,12 +72,20 @@ class AssetListEpcData: epc_searcher.find_property(skip_os=True) if epc_searcher.newest_epc is None: continue - - find_epc_searcher = RetrieveFindMyEpc( - address=epc_searcher.newest_epc["address1"], - postcode=epc_searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + # Attempt both methods: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data: {e}") + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() time.sleep(0.5) # We need uprn diff --git 
a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py index f93a5a73..9852cc0d 100644 --- a/etl/find_my_epc/RetrieveFindMyEpc.py +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -25,6 +25,7 @@ class RetrieveFindMyEpc: self.postcode = postcode self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + self.walls = [] @staticmethod def extract_low_carbon_sources(soup): @@ -102,6 +103,8 @@ class RetrieveFindMyEpc: # 2) Bills estimates # 3) Recommendations and SAP points # 4) Low and zero carbon energy sources + # 5) The wall types of the property - used for determining if we have an extension wall insulation# + # recommendation ratings = address_res.find('desc', {'id': 'svg-desc'}).text current_rating = ratings.split(".")[0] @@ -208,6 +211,17 @@ class RetrieveFindMyEpc: if key not in assessment_data: raise ValueError(f"Missing key: {key}") + # The wall types of the property + property_features_table = address_res.find("tbody", class_="govuk-table__body") + property_features_table = property_features_table.find_all("tr") + + # Extract wall types + self.walls = [] + for row in property_features_table: + cells = row.find_all("td") + if row.find("th").text.strip() == "Wall": + self.walls.append(cells[0].text.strip()) + # Finally, we format the recommendations recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) @@ -229,8 +243,7 @@ class RetrieveFindMyEpc: return resulting_data - @staticmethod - def format_recommendations(recommendations, assessment_data, sap_2012_date=None): + def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None): """ This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey :param recommendations: The recommendations from the EPC @@ -317,7 +330,8 @@ class RetrieveFindMyEpc: "roomstat_programmer_trvs", "time_temperature_zone_control" ], "Replacement warm air unit": [], - "Secondary 
glazing": ["secondary_glazing"] + "Secondary glazing": ["secondary_glazing"], + "Condensing heating unit": ["boiler_upgrade"], } survey = True @@ -330,6 +344,8 @@ class RetrieveFindMyEpc: for rec in recommendations: mapped = measure_map[rec["measure"]] for measure in mapped: + if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower(): + measure = "extension_cavity_wall_insulation" to_append = { "type": measure, "sap_points": rec["sap_points"], diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py deleted file mode 100644 index 8d19aa84..00000000 --- a/etl/route_march_data_pull/app.py +++ /dev/null @@ -1,396 +0,0 @@ -import os -import time - -import pandas as pd -import numpy as np -from tqdm import tqdm - -from dotenv import load_dotenv -from backend.SearchEpc import SearchEpc -from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc -from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes - -from recommendations.recommendation_utils import ( - estimate_perimeter, - estimate_external_wall_area, - estimate_number_of_floors -) - -load_dotenv(dotenv_path="backend/.env") -EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") - - -def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map): - epc_data = [] - errors = [] - no_epc = [] - for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): - try: - postcode = home[postcode_column] - house_number = home[address1_column].strip() - full_address = home[fulladdress_column].strip() - house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode) - if house_no is None: - house_no = house_number - uprn = manual_uprn_map.get(full_address, None) - - searcher = SearchEpc( - address1=str(house_no), - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5, - uprn=uprn - ) - # Force the skipping of estimating the 
EPC - searcher.ordnance_survey_client.property_type = None - searcher.ordnance_survey_client.built_form = None - - searcher.find_property(skip_os=True) - - # Check if we have a flat or appartment - if searcher.newest_epc is None and uprn is None: - # Try again: - if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None: - # Backup - add1 = full_address.split(",") - if len(add1) > 1: - add1 = add1[1].strip() - else: - # Try splitting on space - add1 = full_address.split(" ")[0].strip() - - else: - add1 = str(house_number) - searcher = SearchEpc( - address1=add1, - postcode=postcode, - auth_token=EPC_AUTH_TOKEN, - os_api_key="", - property_type=None, - fast=True, - full_address=full_address, - max_retries=5 - ) - - if ( - "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in - house_number.lower() - ): - searcher.ordnance_survey_client.property_type = "Flat" - - searcher.find_property(skip_os=True) - - if searcher.newest_epc is None: - no_epc.append(home["row_id"]) - continue - - # Look for EPC recommendatons - try: - property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"]) - except: - property_recommendations = {"rows": []} - - # Retrieve data from FindMyEPC - try: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - except ValueError as e: - if "No EPC found" in str(e) and "address1" in searcher.newest_epc: - find_epc_searcher = RetrieveFindMyEpc( - address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"] - ) - find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() - else: - find_epc_data = {} - except Exception as e: - raise Exception(f"Error retrieving FindMyEPC data: {e}") - time.sleep(np.random.uniform(0.1, 1)) - - epc = { - "row_id": home["row_id"], - **searcher.newest_epc.copy(), 
- "recommendations": property_recommendations["rows"], - "find_my_epc_data": find_epc_data, - } - - epc_data.append(epc) - except Exception as e: - errors.append(home["row_id"]) - time.sleep(5) - - return epc_data, errors, no_epc - - -def extract_address1(asset_list, full_address_col, method="first_two_words"): - if method == "first_two_words": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ") - return asset_list - - if method == "first_word": - asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0] - return asset_list - - raise ValueError(f"Method {method} not recognized") - - -def app(): - """ - This app is EPC pulling data for some properties owned by Livewest - - Data request contents: - Date of last EPC - Reason for EPC - SAP score on register - Property Type - Property Area - Property Age - Any Dimensions (HLP,PW,RH) - Property Wall Construction - Heating Type - Secondary Heating - Loft Insulation Depth - - Additional if possible: - Heat loss calculations - EPC recommendations - Property UPRN - - """ - DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern" - DATA_FILENAME = "January 2025 Additions Query.xlsx" - SHEET_NAME = "Jan 2025 additions" - POSTCODE_COLUMN = "Post Code" - FULLADDRESS_COLUMN = "Street / Block Name" - ADDRESS1_COLUMN = None - ADDRESS1_METHOD = "first_word" - ADDRESS_COLS_TO_CONCAT = [] - - # Maps addresses to uprn in problematic cases - MANUAL_UPRN_MAP = { - "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560 - } - - asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() - asset_list["row_id"] = asset_list.index - - # We clean up portential non-breaking spaces, and double spaces - for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]: - 
asset_list[col] = asset_list[col].astype(str) - asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False) - asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False) - - if ADDRESS1_COLUMN is None: - ADDRESS1_COLUMN = "address1_extracted" - asset_list = extract_address1( - asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD - ) - - if FULLADDRESS_COLUMN is None: - FULLADDRESS_COLUMN = "fulladdress_extracted" - # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas - asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1) - - # We check for duplicated addresses - asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN] - if asset_list["deduper"].duplicated().sum(): - # Drop the dupes - print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping") - asset_list = asset_list[~asset_list["deduper"].duplicated()] - asset_list = asset_list.drop(columns=["deduper"]) - - epc_data, errors, no_epc = get_data( - asset_list=asset_list, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) - - # We now retrieve any failed properties - asset_list_failed = asset_list[asset_list["row_id"].isin(errors)] - epc_data_failed, _, _ = get_data( - asset_list=asset_list_failed, - fulladdress_column=FULLADDRESS_COLUMN, - address1_column=ADDRESS1_COLUMN, - postcode_column=POSTCODE_COLUMN, - manual_uprn_map=MANUAL_UPRN_MAP - ) - - no_data = asset_list[asset_list["row_id"].isin(no_epc)] - print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]]) - - # Append the failed data to the main data - epc_data.extend(epc_data_failed) - - epc_df = pd.DataFrame(epc_data) - - # We expand out the recommendations - recommendations_df = epc_df[["row_id", "recommendations"]] - - unique_recommendations = set() - for _, row in 
recommendations_df.iterrows(): - unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) - - columns = ["row_id"] + list(unique_recommendations) - transformed_data = [] - for _, row in recommendations_df.iterrows(): - # Initialize a dictionary for this row with False for all recommendations - row_data = {col: False for col in columns} - row_data["row_id"] = row["row_id"] - - # Set True for each recommendation present in this row - for rec in row["recommendations"]: - recommendation_text = rec["improvement-summary-text"] - row_data[recommendation_text] = True - - # Append the row data to transformed_data - transformed_data.append(row_data) - - transformed_df = pd.DataFrame(transformed_data) - # Drop the column that is "" - if "" in transformed_df.columns: - transformed_df = transformed_df.drop(columns=[""]) - - # Get the find my epc data - find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join( - pd.json_normalize(epc_df["find_my_epc_data"]) - ) - # We check if we get the solar pv column: - if "Solar photovoltaics" not in find_my_epc_data.columns: - find_my_epc_data["Solar photovoltaics"] = False - - # Retrieve just the data we need - epc_df = epc_df[ - [ - "row_id", - "uprn", - "address1", - "address", - "postcode", - "property-type", - "built-form", - "inspection-date", - "current-energy-rating", - "current-energy-efficiency", - "roof-description", - "walls-description", - "floor-description", - "transaction-type", - # New fields needed - "secondheat-description", - "total-floor-area", - "construction-age-band", - "floor-height", - "number-habitable-rooms", - "mainheat-description", - # - "energy-consumption-current", # kwh/m2 - "photo-supply", - ] - ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}) - - asset_list = asset_list.merge( - epc_df, - how="left", - on="row_id" - ).merge( - find_my_epc_data[ - [ - "row_id", 
"heating_text", "hot_water_text", 'Assessor’s name', - "Assessor's Telephone", "Assessor's Email", "Accreditation scheme", - "Assessor’s ID", "Solar photovoltaics" - ] - ].rename( - columns={ - "Solar photovoltaics": "Has Solar PV", - "heating_text": "Heating Estimated kWh", - "hot_water_text": "Hot Water Estimated kWh", - } - ), - how="left", - on="row_id" - ) - - asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""]) - asset_list = asset_list.drop(columns=["photo-supply"]) - - # Rename the columns - asset_list = asset_list.rename(columns={ - "inspection-date": "Date of last EPC", - "current-energy-efficiency": "SAP score on register", - "current-energy-rating": "EPC rating on register", - "property-type": "Property Type", - "built-form": "Archetype", - "total-floor-area": "Property Floor Area", - "construction-age-band": "Property Age Band", - "floor-height": "Property Floor Height", - "number-habitable-rooms": "Number of Habitable Rooms", - "walls-description": "Wall Construction", - "roof-description": "Roof Construction", - "floor-description": "Floor Construction", - "mainheat-description": "Heating Type", - "secondheat-description": "Secondary Heating", - "transaction-type": "Reason for last EPC", - "energy-consumption-current": "Heat Demand (kWh/m2)", - }) - - asset_list["Estimated Number of Floors"] = asset_list.apply( - lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull( - x["Property Type"]) else None, axis=1 - ) - - asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float) - # Replace "" value with None - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None) - asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float) - - asset_list["Estimated Perimeter (m)"] = asset_list.apply( - lambda x: estimate_perimeter( - floor_area=x["Property Floor Area"] / 
x["Estimated Number of Floors"], - num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"], - ), axis=1 - ) - - asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( - lambda x: estimate_external_wall_area( - num_floors=x["Estimated Number of Floors"], - floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, - perimeter=x["Estimated Perimeter (m)"], - built_form=x["Archetype"] - ), - axis=1 - ) - - asset_list["Roof Insulation Thickness"] = asset_list.apply( - lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( - x["Roof Construction"]) else None, - axis=1 - ) - - # For all of the columns in transformed_df, prefix with "Recommendation: " - for col in transformed_df.columns: - if col == "row_id": - continue - transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"}) - - asset_list = asset_list.merge( - transformed_df, - how="left", - on="row_id" - ) - asset_list = asset_list.drop(columns=["row_id", "index"]) - - # Store as an excel - filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx" - asset_list.to_excel(filename, index=False) - - matches_review = asset_list[ - [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"] - ] diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index c5c07f89..e4dd3a78 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -852,6 +852,8 @@ class HeatingRecommender: else: heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"] + # TODO:We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion + # we'll keep this for the moment though if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]: 
heating_simulation_config["hot_water_energy_eff_ending"] = "Average" else: @@ -993,7 +995,7 @@ class HeatingRecommender: # We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler has_inefficient_water = ( self.property.data["mains-gas-flag"] and - self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] + self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"] ) non_invasive_recommendation = next(( diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index 15614a0b..715332a5 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -503,7 +503,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], "sap": sap + rec["sap_points"], "carbon": carbon - rec["co2_equivalent_savings"], "heat_demand": heat_demand - rec["heat_demand"], @@ -621,6 +623,13 @@ class Recommendations: if li_sap_limit is not None: property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit) + if rec["type"] == "solar_pv": + # We use the SAP points in the recommendation as a minimum + property_phase_impact["sap"] = ( + rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else + property_phase_impact["sap"] + ) + # Insert this information into the recommendation. 
if not rec.get("survey", False): rec["sap_points"] = property_phase_impact["sap"] @@ -647,7 +656,9 @@ class Recommendations: return property_recommendations, impact_summary @staticmethod - def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description): + def map_descriptions_to_fuel( + heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types + ): # Handle the case of community schemes if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"): @@ -660,7 +671,7 @@ class Recommendations: } raise NotImplementedError("Handle this case") - mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description] + mapped = descriptions_to_fuel_types[heating_description] heating_fuel = mapped["fuel"] if hotwater_description in [ @@ -680,7 +691,7 @@ class Recommendations: "heating_cop": mapped["cop"], "hotwater_cop": 1 } - mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description] + mapped_hotwater = descriptions_to_fuel_types[hotwater_description] return { "heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"], @@ -689,7 +700,7 @@ class Recommendations: @classmethod def calculate_recommendation_tenant_savings( - cls, property_instance, kwh_simulation_predictions, property_recommendations + cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None ): """ This method inserts the kwh savings and the bill savings that the customer will make from the recommendations @@ -701,9 +712,12 @@ class Recommendations: :param property_instance: Instance of the Property class, for the home associated to property_id :param kwh_simulation_predictions: dictionary of predictions from the model apis :param property_recommendations: dictionary of recommendations for the property + :param ashp_cop: The coefficient of performance for the air source heat pump. 
:return: """ + ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY + kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][ kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id) ].merge( @@ -772,12 +786,19 @@ class Recommendations: if kwh_impact_table.loc[i, col] > previous_phase[col].max(): kwh_impact_table.loc[i, col] = previous_phase[col].max() + descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES + # We will the air source heat pump efficiencies + ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()] + for k in ashp_keys: + descriptions_to_fuel_types[k]["cop"] = ashp_cop + # For heating system recommendations, this could result in a fuel type change so we reflect that fuel_mapping = pd.DataFrame([ { "id": epc["id"], **cls.map_descriptions_to_fuel( - epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"] + epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"], + descriptions_to_fuel_types ) } for epc in property_instance.updated_simulation_epcs ]) @@ -791,7 +812,8 @@ class Recommendations: **cls.map_descriptions_to_fuel( property_instance.data["mainheat-description"], property_instance.data["hotwater-description"], - property_instance.data["main-fuel"] + property_instance.data["main-fuel"], + descriptions_to_fuel_types ) } ] diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 95f189d3..a97dbcb3 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -14,11 +14,16 @@ class SolarPvRecommendations: # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group SOLAR_PANEL_WATTAGE = 400 + # For domestic properties, we don't recommend a solar PV system with wattage outside of these + # bounds MAX_SYSTEM_WATTAGE = 6000 
MIN_SYSTEM_WATTAGE = 1000 + # the maximum area of root we allow to be covered in solar panels for our recommendations. MAX_ROOF_AREA_PERCENTAGE = 0.7 + SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1 + def __init__(self, property_instance): """ :param property_instance: Instance of the Property class, for the home associated to property_id @@ -212,6 +217,20 @@ class SolarPvRecommendations: roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100) # We round up to the nearest 5 roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5 + + # Typically, we've observed that every 5% of additional roof coverage will result in at least + # an additional 1 SAP points (though often 2 points) Given this, we can add a reasonable minimum + # for the number of SAP points we might expect. We've observed that for some cases where properties + # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict + # the number of SAP points. This appears to be due to a relatively small number of properties + # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a + # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels. + # Because panels are the final recommendation, they are often the measure that takes the home + # into the medium to high EPC A ranges and so because of a lack of training data, this means that + # we might sometime under-predict. This minimum is intended to try and reduce the negative impact + # of this. 
This minimum is used in Recommendations.calculate_recommendation_impact + minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE + for has_battery in [False, True]: cost_result = self.costs.solar_pv( has_battery=has_battery, @@ -240,7 +259,7 @@ class SolarPvRecommendations: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": minimum_sap_points, "already_installed": already_installed, **cost_result, # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py index 1f755369..46e56c93 100644 --- a/recommendations/WindowsRecommendations.py +++ b/recommendations/WindowsRecommendations.py @@ -215,21 +215,29 @@ class WindowsRecommendations: "glazed-type": glazed_type_ending, } + measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing" + + non_invasive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]), + {} + ) + self.recommendation = [ { "phase": phase, "parts": [], "type": "windows_glazing", - "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing", + "measure_type": measure_type, "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, **cost_result, "is_secondary_glazing": is_secondary_glazing, "description_simulation": description_simulation, "simulation_config": simulation_config, + "survey": non_invasive_recommendation.get("survey", None), } ] diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 00da6107..602684cf 100644 --- a/recommendations/recommendation_utils.py +++ 
from string import Template


def generate_html_report(template_path, output_path, data):
    """
    Reads an HTML template file, injects dynamic values, and generates a final HTML report.

    Args:
    - template_path (str): Path to the HTML template file (``$name`` / ``${name}`` placeholders).
    - output_path (str): Path to save the generated HTML file.
    - data (dict): Dictionary containing dynamic values for the report. Placeholders without a
      matching key are left untouched in the output because ``safe_substitute`` is used.
    """
    # Read the template file
    with open(template_path, "r", encoding="utf-8") as f:
        html_template = Template(f.read())

    # safe_substitute never raises on a missing key, it simply leaves the placeholder in place
    final_html = html_template.safe_substitute(data)

    # Save the generated HTML file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_html)

    print(f"HTML report generated successfully: {output_path}")


def stringify_number(num: int, rounding: bool = True) -> str:
    """
    Format a number for display in the report.

    Values that display with five figures or fewer are rounded UP to the nearest 100 and shown with
    thousands separators (e.g. 1234 -> "1,300"). Larger values are rounded UP to the nearest 1,000
    and shown in thousands with a "k" suffix (e.g. 250001 -> "251k").

    :param num: The number to format
    :param rounding: When False no rounding up is applied; large values are truncated to whole
        thousands for the "k" form
    :return: The formatted string
    """
    if num < 100000:
        small = ((num + 99) // 100) * 100 if rounding else num
        if small < 100000:
            return f"{small:,}"
        # Rounding up pushed the value to six figures (e.g. 99,950 -> 100,000). Fall through so it is
        # rendered as "100k", consistent with how 100,000 itself is rendered.
        num = small

    thousands = (num + 999) // 1000 if rounding else num // 1000
    return f"{thousands}k"


class PlacidApi:
    """
    Thin client for the PDF endpoints of the Placid REST API.
    """

    BASE_URL = "https://api.placid.app/api/rest/pdfs"
    # Seconds to wait on any single HTTP call, so a stalled connection cannot hang the report run
    REQUEST_TIMEOUT = 30

    # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
    ERROR_CODES = {
        400: "Bad request",
        401: "Unauthorized",
        404: "Template Not found",
        422: "Validation error",
        429: "Rate limit exceeded",
        500: "Internal server error",
    }

    def __init__(self, api_key):
        """
        :param api_key: Placid API token, sent as a bearer token on every request
        """
        self.api_key = api_key

        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    @classmethod
    def _error_message(cls, status_code: int) -> str:
        """Return the documented description for an HTTP error status, or a generic fallback."""
        return cls.ERROR_CODES.get(status_code, f"Unexpected HTTP status {status_code}")

    def _parse_response(self, response) -> dict:
        """
        Return the JSON body of a Placid response.

        :raises RuntimeError: if the response carries a 4xx/5xx status
        """
        if response.status_code >= 400:
            raise RuntimeError(
                f"Placid API request failed with {response.status_code}: "
                f"{self._error_message(response.status_code)}"
            )
        return response.json()

    def create_pdf(
            self,
            template_uuid: str,
            current_epc_rating: str,
            current_epc_rating_colour: str,
            post_retrofit_epc_rating: str,
            post_retrofit_epc_rating_colour: str,
    ):
        """
        Queue the generation of a single page PDF from a Placid template.

        :param template_uuid: The Placid template to render
        :param current_epc_rating: Text for the "current_epc_rating" layer
        :param current_epc_rating_colour: Text colour for the "current_epc_rating" layer
        :param post_retrofit_epc_rating: Text for the "post_retrofit_epc_rating" layer
        :param post_retrofit_epc_rating_colour: Text colour for the "post_retrofit_epc_rating" layer
        :return: The decoded JSON response. Its "id" is what get_pdf expects
        :raises RuntimeError: if Placid answers with an error status
        """
        import requests

        body = {
            "webhook_success": None,
            "passthrough": None,
            "pages": [
                {
                    "template_uuid": template_uuid,
                    "layers": {
                        "current_epc_rating": {
                            "text": current_epc_rating,
                            "text_color": current_epc_rating_colour,
                        },
                        "post_retrofit_epc_rating": {
                            "text": post_retrofit_epc_rating,
                            "text_color": post_retrofit_epc_rating_colour,
                        }
                    },
                },
            ]
        }

        response = requests.post(
            self.BASE_URL,
            headers=self.headers,
            json=body,
            timeout=self.REQUEST_TIMEOUT,
        )

        return self._parse_response(response)

    def get_pdf(
            self,
            pdf_id: str,
            output_path: str = "survey_report/example_data/output.pdf",
            poll_interval: float = 5,
            timeout: float = 300,
    ):
        """
        Poll the API every ``poll_interval`` seconds until the PDF is ready, then download it.

        A freshly created PDF is returned as queued with "pdf_url" set to None, so a single request
        straight after create_pdf is not enough.

        :param pdf_id: The "id" returned by create_pdf
        :param output_path: Where the downloaded PDF is written
        :param poll_interval: Seconds to sleep between status checks
        :param timeout: Maximum number of seconds to wait for the PDF to become available
        :return: output_path
        :raises RuntimeError: if Placid answers with an error status or reports a failed render
        :raises TimeoutError: if the PDF is not ready within ``timeout`` seconds
        """
        import time

        import requests

        url = f"{self.BASE_URL}/{pdf_id}"
        deadline = time.monotonic() + timeout

        while True:
            response_body = self._parse_response(
                requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            )
            pdf_url = response_body.get("pdf_url")
            if pdf_url:
                break
            # NOTE(review): "error" is assumed to be the status Placid uses for a failed render -
            # confirm against the REST docs
            if response_body.get("status") == "error":
                raise RuntimeError(f"Placid failed to generate PDF {pdf_id}")
            if time.monotonic() >= deadline:
                raise TimeoutError(f"PDF {pdf_id} was not ready after {timeout} seconds")
            time.sleep(poll_interval)

        # Download the PDF from this url
        pdf_download = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        pdf_download.raise_for_status()
        with open(output_path, "wb") as f:
            f.write(pdf_download.content)

        return output_path
3/3 " + "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " + "ROAD FLAT 3 PRE EPR PDF.pdf", + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 3/3 WILLIS ROAD FLAT 3 POST EPR SITE NOTES.pdf" + }, + ] + + data = [] + for data_config in folders: + + file_mapping = {} + for filename, filepath in data_config.items(): + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() + + # Check the report type + report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[filename] = text + + # This is only set up to work with quido site notes so we must have it + site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"]) + site_notes = site_notes_extractor.extract_all() + + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["epr"]) + epr = epr_extractor.extract_all() + + # Valuation simulation + scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"]) + scenario_site_notes = scenario_site_notes_extractor.extract_all() + + from backend.ml_models.Valuation import PropertyValuation + valuation_uplift = PropertyValuation.estimate_valuation_improvement( + current_value=current_property_value, + current_epc=site_notes["Current EPC Band"], + target_epc=scenario_site_notes["Current EPC Band"], + ) + # TODO - should convert this, when it's more than 5 figures and we should certainly stringify this + + valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value) + + # Prepare the data for output + bill_savings = round( + site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)'] + ) + + carbon_savings = round( + site_notes["Current Carbon 
Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"], + 2 + ) + + payback_period = None + if payback_period is None: + raise NotImplementedError("Implement me") + + # We extract the measures from the site notes + + report_data = { + "current_epc_rating": site_notes["Current EPC Band"], + "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]], + "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"], + "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]], + "bill_savings": stringify_number(bill_savings), + "valuation_improvement": stringify_number(valuation_difference), + "carbon_savings": carbon_savings, + + } + + # We now produce the combined data sheet which is the starting figure: + # data_sheet = {**epr, **site_notes} + # del data_sheet['Building Dimensions'] + # # We unnest the Total Building Dimensions + # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + # del data_sheet["Total Building Dimensions"] + + create_pdf_response = placid_api.create_pdf( + template_uuid=TEMPLATE_UUID, **report_data + ) + # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None} + # Download locally + placid_api.get_pdf(create_pdf_response["id"]) + + data = pd.DataFrame(data) + + # Generate the HTML report + # Placeholder locations + template_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/template.html" + output_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/output/report.html" + logo_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/assets/logo.png" + generate_html_report( + template_path, output_path, + data={ + "address": data_sheet["Address"], + "logo_path": logo_path, + "current_epc": 
import re


# --- survey_report/extraction/detect_report_type.py ---

def detect_report_type(first_page):
    """
    Detects the type of report based on the first page of the report
    :param first_page: Text extracted from the first page of the PDF
    :return: "quidos_site_notes", "quidos_epr", or None when the page is empty or not recognised
    """
    # Set up for the minute to handle quidos files. We have the Elmhurst logic so we can introduce
    # this when we need

    # A page with no extractable text cannot be classified
    if not first_page:
        return None

    if re.match(
        r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator",
        first_page
    ):
        return "quidos_site_notes"

    if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page):
        return "quidos_epr"

    return None


# --- survey_report/extraction/quidos.py ---

class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy costs and building dimensions from the text of a
    Quidos site notes (EPC summary) report.
    """

    def __init__(self, pdf_text):
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self):
        """
        Extracts the current and potential SAP rating (band letter and score) from the report.

        :raises ValueError: if the ratings cannot be found
        """
        found = re.search(
            r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text
        )

        if not found:
            raise ValueError("No SAP rating found in the report")

        self.data.update({
            "Current EPC Band": found.group(1),
            "Current SAP Rating": int(found.group(2)),
            "Potential EPC Band": found.group(3),
            "Potential SAP Rating": int(found.group(4)),
        })

    def extract_carbon_emissions(self):
        """
        Extracts the current annual carbon emissions (TCO2).

        :raises ValueError: if the emissions figure cannot be found
        """
        found = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)

        if not found:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(found.group(1)),
        })

    def extract_building_dimensions(self):
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions.

        :raises ValueError: if the dimensions table, or any row within it, cannot be found
        """

        # Locate the Dimensions section: everything between the table header and the next heading
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
        )

        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        building_parts = []
        for match in building_part_pattern.finditer(dimensions_text):
            to_append = {
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": float(match.group(3)),
                "Loss Perimeter (m)": float(match.group(4)),
                "Party Wall Length (m)": float(match.group(5)),
            }
            # We calculate the heat loss area
            to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"]
            building_parts.append(to_append)

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # We calculate some totals
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self):
        """
        Extracts the estimated annual energy costs (£) from the report.

        :raises ValueError: if the cost figure cannot be found
        """
        found = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)

        if not found:
            raise ValueError("No bills estimate found in the report")

        # Thousands separators are dropped before converting
        self.data["Estimated Annual Energy Cost (£)"] = float(found.group(1).replace(",", ""))

    def extract_all(self):
        """
        Runs all extraction methods and returns a dictionary with extracted data.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # TODO: Extract specific measures - primary wall, secondary wall, roof, floor, heating system,
        #  hot water system, windows, doors, lighting, ventilation, solar

        return self.data

    def extract_walls(self):
        """
        Extracts wall type, insulation, dry-lining, and thickness for each building part from the
        7.0 Walls section of the summary PDF text.

        :return: A list with one dictionary per building part found in the section
        :raises ValueError: if the walls section cannot be found
        """

        text = self.text
        wall_data = []

        # Isolate the 7.0 Walls section
        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        wall_section = wall_section_match.group(1)

        # Define patterns to match walls for each building part. Every detail line is optional, so a
        # part that only lists some of the fields still produces an entry.
        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
            re.MULTILINE
        )

        # TODO: We aren't effectively picking up alternative walls, so the "Alternative ..." keys
        #  below are always None for now

        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            # "Main Property Alternative" is reported under the main property
            building_part = "Main Property" if "Main Property" in building_part else building_part

            thickness = match.group("thickness")

            wall_entry = {
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                # The group only ever captures digits, so no further validation is needed
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            }

            wall_data.append(wall_entry)

        return wall_data


class EPRExtractor:
    """
    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).
    """

    def __init__(self, pdf_text):
        """
        Initializes the EPRExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        self.data = {}

    def extract_heating_consumption(self):
        """
        Extracts space heating and water heating values (KWH) from the report.

        :raises ValueError: if the heating figures cannot be found
        """
        found = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            re.DOTALL
        )

        if not found:
            raise ValueError("No heating data found in the report")

        self.data.update({
            "Space Heating (KWH)": int(found.group(1).replace(",", "")),
            "Water Heating (KWH)": int(found.group(2).replace(",", ""))
        })

    def extract_address(self):
        """
        Extracts the address from the report: the text between the "Address" and "Town" labels.

        :raises ValueError: if the address cannot be found
        """
        found = re.search(
            r"Address\s*(.*?)\nTown\s*(.*?)\n",
            self.text,
            re.DOTALL
        )

        if not found:
            raise ValueError("No address found in the report")

        # The town (group 2) is matched to delimit the address but is not stored
        full_address = found.group(1).strip()
        self.data["Address"] = full_address

    def extract_all(self):
        """
        Runs all extraction methods and returns a dictionary with extracted data.
        """
        self.extract_address()
        self.extract_heating_consumption()
        return self.data
+ +
+
+

Domna Energy Report

+

${address}

+
+ +
+ + +
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+ +
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+ +
+ + +