class DataRemapper:
    """
    Maps free-text values onto a fixed set of standard categories.

    Each value is resolved in order by: the predefined mapping, an exact match against the
    standard values, a fuzzy match, and finally one batched OpenAI request for whatever is
    still unmapped. Token usage and estimated cost of the OpenAI calls are accumulated on
    the instance.
    """

    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
        """
        Initialize the remapper with standard values and a predefined mapping.

        :param standard_values: Set of allowed standardized values.
        :param standard_map: Dictionary of common remappings {raw_value: standard_value}.
            ``None`` is treated as an empty mapping.
        :param max_tokens: Maximum prompt size (in tokens) accepted before calling OpenAI;
            also passed to the API as the completion limit.
        """
        self.standard_values = standard_values
        # A missing map becomes an empty dict so membership checks never run against None
        self.standard_map = standard_map if standard_map is not None else {}
        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing

        # Tokenizer for counting tokens
        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)

        # Track token usage and remap dictionary
        self.total_tokens_used = 0
        self.total_cost = 0
        self.remap_dict = {}  # {original_value: standardized_value}
        self.max_tokens = max_tokens  # Limit for OpenAI API

        # Memoization for AI calls
        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
        # Capture the response for debugging
        self.ai_response = None

        # OpenAI pricing (as of Feb 2024)
        self.pricing = {
            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
        }

        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)

    @staticmethod
    def clean_string(text):
        """
        Basic text cleaning: remove punctuation, collapse whitespace, and normalize case.

        :param text: Value to clean.
        :return: The cleaned string, or None when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return None
        text = text.strip().lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)
        return text

    @staticmethod
    def _parse_ai_mapping(output_text, unmapped_values):
        """
        Parse the model's reply into a {original_value: standardized_value} dictionary.

        The reply is treated as data only: it is parsed as JSON first and as a Python literal
        second, and is never executed. Anything that does not parse to a dictionary yields the
        fallback mapping of every unmapped value to "unknown".

        :param output_text: Raw text returned by the model.
        :param unmapped_values: Values that were sent for standardization.
        :return: Dictionary {original_value: standardized_value}.
        """
        # Local imports keep this block self-contained
        import ast
        import json

        text = (output_text or "").strip()
        # The prompt asks for no markdown, but tolerate a fenced block if one is returned anyway
        text = re.sub(r"^`{3}[A-Za-z]*\s*|\s*`{3}$", "", text)

        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                continue
            if isinstance(parsed, dict):
                return parsed

        return {val: "unknown" for val in unmapped_values}  # Fallback

    def fuzzy_match(self, text):
        """
        Use fuzzy matching to find the closest standard value.

        :param text: Cleaned value to match.
        :return: The best-matching standard value when its score reaches ``fuzzy_threshold``,
            otherwise None.
        """
        if not text:
            return None
        result = process.extractOne(text, self.standard_values)
        if result is None:
            # Nothing to match against
            return None
        match, score = result[0], result[1]
        return match if score >= self.fuzzy_threshold else None

    def count_tokens(self, text):
        """Estimate the number of tokens in a given text."""
        return len(self.tokenizer.encode(text)) if text else 0

    def ai_standardize(self, unmapped_values):
        """
        Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.

        :param unmapped_values: Values that could not be mapped by rules or fuzzy matching.
        :return: Dictionary {original_value: standardized_value}; empty when there is nothing to map.
        :raises ValueError: if the prompt exceeds ``max_tokens``.
        """
        if not unmapped_values:
            return {}

        unmapped_tuple = tuple(sorted(unmapped_values))  # Ensure consistency for memoization
        if unmapped_tuple in self.ai_cache:
            return self.ai_cache[unmapped_tuple]  # Return memoized result

        prompt = f"""
        You are an expert in data classification. Standardize each of these values into one of the categories:
        {list(self.standard_values)}.

        Return only a JSON dictionary where:
        - The keys are the original values.
        - The values are the standardized ones.

        Strictly return JSON **without markdown formatting** or extra text.

        Example Output:
        {{
            "BLKHOUS": "block house",
            "BEDSIT": "bedsit"
        }}

        Values to standardize:
        {unmapped_values}
        """

        # Count input tokens
        input_tokens = self.count_tokens(prompt)
        if input_tokens > self.max_tokens:
            raise ValueError("Input tokens exceed the maximum limit.")

        logger.info("Calling OpenAI API for standardization...")
        response = self.openai_client.chat.completions.create(
            model=self.ai_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=self.max_tokens,
            temperature=0.1,
        )

        output_text = response.choices[0].message.content.strip()
        output_tokens = self.count_tokens(output_text)  # Count output tokens

        # Track total token usage
        self.total_tokens_used += input_tokens + output_tokens

        # Estimate cost
        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
        self.total_cost += input_cost + output_cost

        # Parse the reply as data (never evaluated as code)
        mapping = self._parse_ai_mapping(output_text, unmapped_values)

        # Memoize the AI response
        self.ai_cache[unmapped_tuple] = mapping
        # We store the raw AI response for debugging
        logger.debug(f"AI Response: {mapping}")
        self.ai_response = output_text

        return mapping

    def standardize_list(self, values_to_remap):
        """
        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.

        :param values_to_remap: List of raw values to standardize.
        :return: Dictionary {original_value: standardized_value}. This is the instance's cumulative
            ``remap_dict``, so it also contains results from earlier calls.
        """
        unique_values = set(values_to_remap)  # Process only unique values
        standard_map = self.standard_map if self.standard_map is not None else {}

        unmapped_values = []
        for value in unique_values:
            if pd.isna(value):  # Handle NaN values
                self.remap_dict[value] = "unknown"
                continue

            cleaned_value = self.clean_string(value)

            # Rule-Based Check (Predefined Mapping): cleaned form first, then the raw value,
            # then the lower-cased raw value (strings only)
            lowered = value.lower() if isinstance(value, str) else None
            rule_key = next(
                (key for key in (cleaned_value, value, lowered) if key is not None and key in standard_map),
                None
            )
            if rule_key is not None:
                self.remap_dict[value] = standard_map[rule_key]
                continue

            # Exact Match in Standard Values
            if cleaned_value in self.standard_values:
                self.remap_dict[value] = cleaned_value
                continue

            # Fuzzy Matching
            fuzzy_match = self.fuzzy_match(cleaned_value)
            if fuzzy_match:
                self.remap_dict[value] = fuzzy_match
                continue

            # Capture anything that wasn't mapped
            unmapped_values.append(value)

        # AI Model - remap anything unmapped (batch request)
        ai_mapping = self.ai_standardize(unmapped_values)
        self.remap_dict.update(ai_mapping)

        return self.remap_dict

    def report_usage(self):
        """Prints a summary of token usage and cost."""
        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
+ """ + + EPC_API_DATA_NAMES = { + "uprn": "epc_os_uprn", + "address1": "epc_address1", + "address": "epc_address", + "postcode": "epc_postcode", + "inspection-date": "epc_inspection_date", + "current-energy-efficiency": "epc_sap_score_on_register", + "current-energy-rating": "epc_rating_on_register", + "property-type": "epc_property_type", + "built-form": "epc_archetype", + "total-floor-area": "epc_total_floor_area", + "construction-age-band": "epc_age_band", + "floor-height": "epc_floor_height", + "number-habitable-rooms": "epc_number_habitable_rooms", + "walls-description": "epc_wall_construction", + "roof-description": "epc_roof_construction", + "floor-description": "epc_floor_construction", + "mainheat-description": "epc_heating_type", + 'mainheatcont-description': "epc_heating_controls", + "secondheat-description": "epc_secondary_heating", + "transaction-type": "epc_reason", + "energy-consumption-current": "epc_heat_demand", + "photo-supply": "epc_photo_supply", + "estimated": "estimated" + } + FIND_EPC_DATA_NAMES = { + "heating_text": "epc_estiamted_heating_kwh", + "hot_water_text": "epc_estimated_hotwater_kwh", + 'Assessor’s name': "epc_assessor_name", + "Assessor's Telephone": "epc_assessor_telephone", + "Assessor's Email": "epc_assessor_email", + "Accreditation scheme": "epc_assessor_accreditation", + "Assessor’s ID": "epc_assessor_id", + "Solar photovoltaics": "epc_solar_pv" + } + + DATETIME_REMAP = { + "Pre 1900": datetime(year=1899, month=12, day=31), + } + + # These are the accepted methods we have for cleaning the address1 column + ADDRESS_1_CLEANING_METHODS = [ + "first_two_words", # This method will split on the fist two words, where the separator is a space + "first_word", # This method will split on the first word, where the separator is a space + "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber + # "address1_extraction" # This method will use the NLP model to extract address1 + ] + + # 
Standard column Names + STANDARD_ADDRESS_1 = "domna_address_1" + STANDARD_POSTCODE = "domna_postcode" + STANDARD_FULL_ADDRESS = "domna_full_address" + STANDARD_YEAR_BUILT = "landlord_year_built" + STANDARD_UPRN = "ordnance_survey_uprn" + STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id" + STANDARD_PROPERTY_TYPE = "landlord_property_type" + STANDARD_BUILT_FORM = "landlord_built_form" + STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction" + STANDARD_ROOF_CONSTRUCTION = "landlord_roof_construction" + STANDARD_HEATING_SYSTEM = "landlord_heating_system" + STANDARD_EXISTING_PV = "landlord_existing_pv" + STANDARD_SAP = "landlord_sap_rating" + + DOMNA_PROPERTY_ID = "domna_property_id" + + # Regular expression for identifying if the address might point to multiple units + MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b') + + # List of columns relating to the non-intrusive data + NON_INTRUSIVES_COLNAMES = [ + "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required", + "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION", + "Any further surveyor notes", 'Surveyors Name' + ] + + NON_INTRUSIVES_ELIGIBILITY_COLUMN = "Eligibility (Red/Yellow/Green)" + + OLD_FORMAT_NON_INTRUSIVE_COLNAMES = ['WFT Findings', 'ECO Eligibility'] + + # This SAP threshold is a key search criteria for properties that may be eligible for extraction + FILLED_CAVITY_SAP_THRESHOLD = 75 + # This SAP the + EMPTY_CAVITY_SAP_THRESHOLD = 75 + # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable + EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5 + + # Properties before this year are more likely to have lower EPC ratings and more likely to qualify + EMPTY_CAVITY_YEAR_THRESHOLD = 2002 + + # Attributes - these are columns that we produce, calcualted based on other pieces of data + ATTRIBUTE_HAS_SOLAR = "attribute_has_solar" + ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors" + ATTRIBUTE_ESTIMATED_PERIMETER = 
def __init__(
    self,
    local_filepath,
    sheet_name,
    address1_colname,
    postcode_colname,
    full_address_colname,
    landlord_property_id=None,
    full_address_cols_to_concat=None,
    missing_postcodes_method=None,
    address1_extraction_method=None,
    landlord_year_built=None,
    landlord_uprn=None,
    landlord_property_type=None,
    landlord_built_form=None,
    landlord_wall_construction=None,
    landlord_roof_construction=None,
    landlord_heating_system=None,
    landlord_existing_pv=None,
    landlord_sap=None,
    phase=False,
    header=0
):
    """
    Load a landlord asset list and record which of its columns hold which piece of information.

    :param local_filepath: Path of the file to read. Files ending in ".xlsx" are read as Excel,
        anything else as CSV.
    :param sheet_name: Excel sheet to read (only used for ".xlsx" files).
    :param address1_colname: Column holding the first address line (may be None).
    :param postcode_colname: Column holding the postcode.
    :param full_address_colname: Column holding the full address (may be None).
    :param landlord_property_id: Column holding the landlord's own property reference.
    :param full_address_cols_to_concat: Columns to join when no full address column exists.
    :param missing_postcodes_method: Stored for later cleaning steps.
    :param address1_extraction_method: Method name used to derive address 1 when it is missing.
    :param landlord_year_built: Column holding the build year/date.
    :param landlord_uprn: Column holding the UPRN.
    :param landlord_property_type: Column holding the property type.
    :param landlord_built_form: Column holding the built form.
    :param landlord_wall_construction: Column holding the wall construction.
    :param landlord_roof_construction: Column holding the roof construction.
    :param landlord_heating_system: Column holding the heating system.
    :param landlord_existing_pv: Column holding existing PV information.
    :param landlord_sap: Column holding the SAP rating.
    :param phase: When True, the programme is intended to be broken into multiple phases.
    :param header: Row number to use as the column header when reading the file.
    """
    self.local_filepath = local_filepath
    self.sheet_name = sheet_name
    # Read in the data
    if local_filepath.endswith(".xlsx"):
        self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
    else:
        # Honour the header row for CSV input too (header=0 matches the previous default behaviour)
        self.raw_asset_list = pd.read_csv(local_filepath, header=header)
    self.standardised_asset_list = self.raw_asset_list.copy()
    # Will be used to store aggregated figures against the various work types
    self.work_type_figures = {}
    self.flat_data = None
    self.duplicated_addresses = None
    self.contact_details = None
    self.contact_detail_fields = None
    self.outcomes = None
    self.outcomes_no_match = pd.DataFrame()
    self.outcomes_for_output = pd.DataFrame()
    self.master_surveyed = None
    self.unmatched_submissions = pd.DataFrame()

    # When this is True, we intend to break the programme into multiple phases. We may need to review
    # how this is structured in the future, as depending on how we get future data, we may need to
    # remove some existing phases from the reporting, or specifically highlight the phase (1 to n-1)
    # properties, assuming the current phase is n.
    self.phase = phase

    # We detect the presence of the non-intrusive columns
    self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
    # We detect if we have the old format of non-intrusives
    self.old_format_non_intrusives_present = "WFT Findings" in self.raw_asset_list.columns

    self.non_intrusives_eligibility = "Eligibility (Red/Yellow/Green)" in self.raw_asset_list.columns

    # Names of columns
    self.landlord_property_id = landlord_property_id
    self.address1_colname = address1_colname
    self.postcode_colname = postcode_colname
    self.full_address_colname = full_address_colname
    self.landlord_year_built = landlord_year_built
    self.landlord_uprn = landlord_uprn
    self.landlord_property_type = landlord_property_type
    self.landlord_built_form = landlord_built_form
    self.landlord_wall_construction = landlord_wall_construction
    self.landlord_roof_construction = landlord_roof_construction
    self.landlord_heating_system = landlord_heating_system
    self.landlord_existing_pv = landlord_existing_pv
    self.landlord_sap = landlord_sap

    # parameters for cleaning
    self.full_address_cols_to_concat = full_address_cols_to_concat
    self.missing_postcodes_method = missing_postcodes_method
    self.address1_extraction_method = address1_extraction_method

    self.debug_information = {
        "property_type": None,
        "wall_construction": None,
        "heating_system": None,
        "existing_pv": None
    }

    self.variable_mappings = {}
    self.hubspot_data = None

    self.rename_map = {}
    self.keep_variables = []

    # Finally, we handle the case where the landlord's property ID is actually the OS UPRN
    if (self.landlord_property_id is not None) and (self.landlord_uprn == self.landlord_property_id):
        self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
        # Update the reference to landlord UPRN
        self.landlord_uprn = self.STANDARD_UPRN

    # Handle the case when full address and address 1 are the same column. Two unset (None)
    # names are not "the same column", so they are left for later steps to derive.
    if (self.address1_colname is not None) and (self.full_address_colname == self.address1_colname):
        self.full_address_colname = self.STANDARD_FULL_ADDRESS
        self.standardised_asset_list[self.full_address_colname] = (
            self.standardised_asset_list[self.address1_colname].copy()
        )

    # Handle the case where the property type column is the same as the built form column.
    # Both being None (neither supplied) must not be treated as a shared column.
    if (self.landlord_property_type is not None) and (self.landlord_property_type == self.landlord_built_form):
        self.landlord_built_form = self.STANDARD_BUILT_FORM
        self.standardised_asset_list[self.landlord_built_form] = (
            self.standardised_asset_list[self.landlord_property_type].copy()
        )

    # If landlord built form is None (which it often is) we use the built form from inspections
    if (self.landlord_built_form is None) and self.non_intrusives_present:
        self.landlord_built_form = self.STANDARD_BUILT_FORM
        self.standardised_asset_list[self.landlord_built_form] = (
            self.standardised_asset_list["Archetype"].copy()
        )
def init_standardise(self):
    """
    Clean the core address fields and build the category remappings for the asset list.

    The method drops rows with no postcode, normalises whitespace in the postcode/address
    columns, derives address 1 and the full address when they were not supplied, creates the
    domna property ID, cleans UPRNs, records the columns to keep and how they are renamed,
    flags likely multi-unit addresses, reduces the year-built column to a year, and builds one
    remapping dictionary per categorical column in ``self.variable_mappings``. The mappings are
    printed so they can be reviewed before they are applied.

    :raises ValueError: if address 1 is missing and no extraction method was given, or the full
        address is missing and no columns to concatenate were given.
    :raises NotImplementedError: if a year-built value is in a format that cannot be parsed.
    :return: None
    """

    # Remove rows without a postcode
    if self.postcode_colname is not None:
        # .copy() so the assignment below writes to an independent frame rather than a slice
        self.standardised_asset_list = self.standardised_asset_list.dropna(
            subset=[self.postcode_colname]
        ).copy()
        # We also clean postcode columns where, if there is no space, we create one
        self.standardised_asset_list[self.postcode_colname] = self.standardised_asset_list[
            self.postcode_colname
        ].apply(self._clean_postcode)

    # We clean up potential non-breaking spaces, and double spaces
    for col in [
        c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
        c is not None
    ]:
        self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
        self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
        self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('  ', ' ', regex=False)

    if self.address1_colname is None:
        if self.address1_extraction_method is None:
            raise ValueError("Missing address 1 - please specify an extraction method")
        self.address1_colname = self.STANDARD_ADDRESS_1
        # If we do not have this, we produce it
        self.standardised_asset_list = self._extract_address1(
            asset_list=self.standardised_asset_list,
            full_address_col=self.full_address_colname,
            postcode_col=self.postcode_colname,
            method=self.address1_extraction_method
        )

    if self.full_address_colname is None:
        if not self.full_address_cols_to_concat:
            raise ValueError("Missing full address - please specify columns to concatenate")
        self.full_address_colname = self.STANDARD_FULL_ADDRESS
        self.standardised_asset_list[self.full_address_colname] = (
            self.standardised_asset_list[self.full_address_cols_to_concat].apply(
                lambda x: ", ".join([y for y in x if not pd.isnull(y)]),
                axis=1
            )
        )
    else:
        # Make sure to strip the postcode out of the full address
        self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
            lambda x: self._strip_postcode_from_full_address(
                full_address=x[self.full_address_colname],
                postcode=x[self.postcode_colname]
            ),
            axis=1
        )

    # We create the domna property id
    self.create_property_id()

    # Clean up the UPRN column, if the landlord has provided them
    if self.landlord_uprn is not None:
        self.standardised_asset_list[self.landlord_uprn] = (
            self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn)
        )

    # We keep just the columns we care about and will work through the various columns and standardise
    variables = [
        self.landlord_property_id,
        self.DOMNA_PROPERTY_ID,
        self.address1_colname,
        self.postcode_colname,
        self.full_address_colname,
        self.landlord_uprn,
        self.landlord_property_type,
        self.landlord_built_form,
        self.landlord_year_built,
        self.landlord_wall_construction,
        self.landlord_roof_construction,
        self.landlord_heating_system,
        self.landlord_existing_pv,
        self.landlord_sap,
    ]
    # Keep just non-null variables (e.g. landlord may not provide uprn)
    self.keep_variables = [v for v in variables if v is not None]
    self.rename_map = {
        self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
        self.address1_colname: self.STANDARD_ADDRESS_1,
        self.postcode_colname: self.STANDARD_POSTCODE,
        self.full_address_colname: self.STANDARD_FULL_ADDRESS,
        self.landlord_uprn: self.STANDARD_UPRN,
        self.landlord_property_type: self.STANDARD_PROPERTY_TYPE,
        self.landlord_built_form: self.STANDARD_BUILT_FORM,
        self.landlord_year_built: self.STANDARD_YEAR_BUILT,
        self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION,
        self.landlord_roof_construction: self.STANDARD_ROOF_CONSTRUCTION,
        self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
        self.landlord_existing_pv: self.STANDARD_EXISTING_PV,
        self.landlord_sap: self.STANDARD_SAP,
    }
    self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None}

    non_intrusive_columns = []
    if self.non_intrusives_present:
        # Copy: the eligibility column may be appended below, and the class-level list
        # must not be modified by that append
        non_intrusive_columns = list(self.NON_INTRUSIVES_COLNAMES)

    if self.non_intrusives_eligibility:
        non_intrusive_columns.append(self.NON_INTRUSIVES_ELIGIBILITY_COLUMN)

    if self.old_format_non_intrusives_present:
        # We check if we have the ECO Eligibility column, which we might not have
        non_intrusive_columns = [
            c for c in self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES if c in self.standardised_asset_list.columns
        ]

    self.keep_variables += non_intrusive_columns

    self.rename_map = {
        **self.rename_map,
        **dict(
            zip(non_intrusive_columns, ["non-intrusives: " + c for c in non_intrusive_columns])
        )
    }

    # We identify addresses which are likely to be multi-addresses (e.g. are rooms x-y)
    self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
        self.full_address_colname
    ].apply(self._identify_multi_address)

    # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and
    # we see instances of "average thermal transmittance" in the description
    if self.landlord_wall_construction is not None:
        self.standardised_asset_list[self.landlord_wall_construction] = np.where(
            # na=False: missing descriptions are simply not a match
            self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains(
                "average thermal transmittance", na=False
            ),
            "new build - average thermal transmittance",
            self.standardised_asset_list[self.landlord_wall_construction]
        )
    else:
        # We want to make sure that we have a column for wall construction
        self.landlord_wall_construction = self.STANDARD_WALL_CONSTRUCTION
        self.standardised_asset_list[self.landlord_wall_construction] = None

    if self.landlord_roof_construction is None:
        self.landlord_roof_construction = self.STANDARD_ROOF_CONSTRUCTION
        self.standardised_asset_list[self.landlord_roof_construction] = None

    # Clean our build year column
    # We attempt to process the year built column
    if self.landlord_year_built is not None:
        # We check if we have a datetime - year built has not been renamed
        if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime):
            # We treat any string values - with common values we see
            self.standardised_asset_list[self.landlord_year_built] = (
                self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP)
            )

            self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime(
                self.standardised_asset_list[self.landlord_year_built]
            )
            # Convert this to year
            self.standardised_asset_list[self.landlord_year_built] = (
                self.standardised_asset_list[self.landlord_year_built].dt.year
            )
        else:
            # We attempt to convert the year built to a year, by detecting the format and converting

            def extract_year(date_str):
                """
                Extracts the year from a year-built value.

                Handles strings such as '01-Jul-YYYY', ranges such as '1983-1990' (the end year is
                used), datetimes, four-digit floats/ints, and strings containing exactly four digits.
                Returns the year as an integer, or None for missing values and known error strings.
                Raises NotImplementedError for any other format.
                """
                known_errors = [
                    "#MULTIVALUE",
                    "This cell has an external reference that can't be shown or edited. Editing this cell will "
                    "remove the external reference.",
                    "ND",
                    'PIMSS EMPTY'
                ]

                if pd.isnull(date_str) or date_str in known_errors or (date_str == 0):
                    return None

                if isinstance(date_str, str):
                    match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str)
                    if match:
                        return int(match.group(1))  # Extract the year and convert to integer
                    if "-" in date_str:

                        # Count the number of times we have "-", as we've seen double ranges
                        # (when we have extensions) so the format is like this:
                        # 'G: 1983-1990, H: 1991-1995'
                        if date_str.count("-") == 2:
                            # We have a double range - take the end of the first one
                            return int(date_str.split("-")[1].split(",")[0])
                        # We probably have a range
                        return int(date_str.split("-")[1].strip())

                if isinstance(date_str, datetime):
                    return date_str.year

                if isinstance(date_str, float):
                    as_int_text = str(int(date_str))
                    if as_int_text.isdigit() and len(as_int_text) == 4:
                        return int(date_str)

                # Check if date_str is a year itself
                if str(date_str).isdigit() and len(str(date_str)) == 4:
                    return int(date_str)

                # Remove any non-numeric characters
                date_str = re.sub(r"\D", "", str(date_str))
                if date_str.isdigit() and len(date_str) == 4:
                    return int(date_str)

                raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me")

            self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[
                self.landlord_year_built
            ].apply(extract_year)

    # We now create standard lookups
    to_remap = {
        self.landlord_property_type: {
            "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES,
            "standard_map": property_type_mappings.PROPERTY_MAPPING
        },
        self.landlord_built_form: {
            "standard_values": built_form_mappings.STANDARD_BUILT_FORMS,
            "standard_map": built_form_mappings.BUILT_FORM_MAPPINGS
        },
        self.landlord_wall_construction: {
            "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS,
            "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS
        },
        self.landlord_heating_system: {
            "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS,
            "standard_map": heating_mappings.HEATING_MAPPINGS
        },
        self.landlord_existing_pv: {
            "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV,
            "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
        },
        self.landlord_roof_construction: {
            "standard_values": roof_mappings.STANDARD_ROOF_CONSTRUCTIONS,
            "standard_map": roof_mappings.ROOF_CONSTRUCTION_MAPPINGS
        }
    }
    # Keep just entries where the key is not None (columns the landlord did not provide)
    to_remap = {k: v for k, v in to_remap.items() if k is not None}

    for variable, config in to_remap.items():
        logger.info("Standardising variable: %s", variable)
        # Strip each of these columns
        self.standardised_asset_list[variable] = self.standardised_asset_list[variable].str.strip()
        values_to_remap = self.standardised_asset_list[variable].unique()
        # We want to map this to our standardised list of values we're interested in
        remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"])
        remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist())
        self.variable_mappings[variable] = remap_dictionary

    # We now print out the variable mappings, which can be reviewed by the user, before the final standardised
    # asset list is returned
    for variable, mapping in self.variable_mappings.items():
        pprint(f"Variable: {variable}")
        pprint(mapping)
        # Print a space
        print("\n")
        pprint("=======================================")
This is only relevant + if there are no categories which need remapping which is highly unlikely + :return: + """ + + if self.phase: + # We filter on just the properties that have had an inspection + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list['Surveyors Name'].isin(["YET TO BE SURVEYED"]) + ] + + if not self.variable_mappings and not override_empty_mappings: + raise ValueError("Please run init_standardise first") + + logger.info("Applying standardisation to asset list") + + for variable, mapping in self.variable_mappings.items(): + self.standardised_asset_list[variable + "_original_from_landlord"] = ( + self.standardised_asset_list[variable].copy() + ) + self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping) + + if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum(): + # Drop the dupes + pprint( + f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated " + f"addresses - dropping" + ) + + # Keep a record of duplicates + self.duplicated_addresses = self.standardised_asset_list[ + self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ][[self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]].copy() + + self.standardised_asset_list = self.standardised_asset_list[ + ~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated() + ] + + # Apply renames to our standard names + # Perform final variable selection and renaming: + + # We add the original columns to the keep variables + self.keep_variables += [ + k + "_original_from_landlord" for k in self.variable_mappings.keys() + ] + + self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename( + columns=self.rename_map + ) + + # We fill any standard columns that are not in the data because they were not provided by the landlord + missing_variables = [ + v for v in [ + self.STANDARD_EXISTING_PV, + 
self.STANDARD_HEATING_SYSTEM, + self.STANDARD_UPRN, + self.STANDARD_PROPERTY_TYPE, + self.STANDARD_YEAR_BUILT, + self.STANDARD_WALL_CONSTRUCTION, + self.STANDARD_HEATING_SYSTEM, + self.STANDARD_EXISTING_PV + ] if v not in self.standardised_asset_list.columns + ] + for v in missing_variables: + self.standardised_asset_list[v] = None + + # Convert to string + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = ( + self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str) + ) + + def merge_data(self, df: pd.DataFrame): + """ + Used to insert data into the standardised asset list, based on the domna property id + :return: + """ + if self.DOMNA_PROPERTY_ID not in df.columns: + raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}") + + if df[self.DOMNA_PROPERTY_ID].duplicated().sum(): + raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs") + + self.standardised_asset_list = self.standardised_asset_list.merge( + df, how="left", on=self.DOMNA_PROPERTY_ID + ) + + def extract_attributes(self, pull_epc=True): + # Used to extracty the typical attributes that we use to identify viable work + + self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = ( + self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | + ~self.standardised_asset_list[self.EPC_API_DATA_NAMES["photo-supply"]].isin(["0.0", 0, None, "", np.nan]) + ) + + accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"] + + # The logic here is: + # 1) Take the property type provided by the HA themselves + # 2) In absence of that, take the EPC property type + # 3) Otherwise use None + self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply( + lambda x: estimate_number_of_floors( + property_type=( + str(x[self.STANDARD_PROPERTY_TYPE]).title() if + str(x[self.STANDARD_PROPERTY_TYPE]).title() in accepted_epc_property_types else ( + 
x[self.EPC_API_DATA_NAMES["property-type"]] if not + pd.isnull(x[self.EPC_API_DATA_NAMES["property-type"]]) else None + ) + ) + ), + axis=1 + ) + + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["total-floor-area"]].astype(float) + ) + # Replace "" value with None + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].replace("", None) + ) + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["number-habitable-rooms"]].astype(float) + ) + + # Estimate the perimeter + self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply( + lambda x: estimate_perimeter( + floor_area=x[self.EPC_API_DATA_NAMES["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + num_rooms=x[self.EPC_API_DATA_NAMES["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + ), axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS], + floor_height=( + float(x[self.EPC_API_DATA_NAMES["floor-height"]]) if + x[self.EPC_API_DATA_NAMES["floor-height"]] else 2.5 + ), + perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER], + built_form=x[self.EPC_API_DATA_NAMES["built-form"]] + ), + axis=1 + ) + + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply( + lambda x: RoofAttributes(description=x[self.EPC_API_DATA_NAMES["roof-description"]]).process()[ + "insulation_thickness"] if not pd.isnull( + x[self.EPC_API_DATA_NAMES["roof-description"]]) else None, + axis=1 + ) + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = ( + 
def process_age_band(self):
    """
    Derive year bounds from the EPC construction age band and compare them with the year built.

    Three columns are added to ``self.standardised_asset_list`` (replacing any left by an earlier
    run, and resetting the index to a fresh RangeIndex):

    * ``epc_year_lower_bound`` / ``epc_year_upper_bound`` - the bounds of the EPC age band, None
      where the band is open-ended on that side or there is no usable band;
    * ``does_age_band_match_epc_age_band`` - one of
      "No EPC Age Band", "No Year Built From Landlord", "EPC Age Band Matches Year Built",
      "EPC Age Band is older than Year Built" (year built is after the band),
      "EPC Age Band is newer than Year Built" (year built is before the band) or
      "EPC Age Band is different from Year Built" (the band is a single year that differs).

    A year built that is null or falsy (e.g. 0) is treated as missing in every branch.

    :return: None
    """
    age_band_column = self.EPC_API_DATA_NAMES["construction-age-band"]
    derived_columns = [
        "epc_year_lower_bound", "epc_year_upper_bound", "does_age_band_match_epc_age_band"
    ]
    # Bands with a lower bound only
    open_ended_bands = {
        "England and Wales: 2007 onwards": 2007,
        "England and Wales: 2012 onwards": 2012,
    }

    no_year_built = "No Year Built From Landlord"
    matches = "EPC Age Band Matches Year Built"
    epc_older = "EPC Age Band is older than Year Built"
    epc_newer = "EPC Age Band is newer than Year Built"

    def describe(age_band, year_built):
        """Return (lower bound, upper bound, comparison label) for one property."""
        if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES:
            return None, None, "No EPC Age Band"

        # NaN is truthy, so a plain `not year_built` would let missing years through
        year_missing = pd.isnull(year_built) or not year_built

        if age_band in open_ended_bands:
            lower = open_ended_bands[age_band]
            if year_missing:
                label = no_year_built
            elif year_built >= lower:
                label = matches
            else:
                # Built before the band starts, so the EPC band is the later of the two
                label = epc_newer
            return lower, None, label

        if age_band == "England and Wales: before 1900":
            if year_missing:
                label = no_year_built
            elif year_built < 1900:
                label = matches
            else:
                # Built in 1900 or later, so the EPC band is the earlier of the two
                label = epc_older
            return None, 1899, label

        if age_band.isdigit():
            # The band is a single year
            band_year = int(age_band)
            if year_missing:
                label = no_year_built
            elif year_built == band_year:
                label = matches
            else:
                label = "EPC Age Band is different from Year Built"
            return band_year, band_year, label

        # Otherwise the band looks like "<region>: <lower>-<upper>"
        lower_date, upper_date = age_band.split(": ")[1].split("-")
        if year_missing:
            label = no_year_built
        elif float(lower_date) <= year_built <= float(upper_date):
            label = matches
        elif year_built > float(upper_date):
            label = epc_older
        else:
            label = epc_newer
        return int(lower_date), int(upper_date), label

    # Drop the output of any earlier run and reset the index, so the new columns line up by position
    assets = self.standardised_asset_list.drop(
        columns=derived_columns, errors="ignore"
    ).reset_index(drop=True)

    records = [
        dict(zip(derived_columns, describe(age_band, year_built)))
        for age_band, year_built in zip(assets[age_band_column], assets[self.STANDARD_YEAR_BUILT])
    ]
    # Explicit columns so an empty asset list still gains the three columns
    processed_age_band = pd.DataFrame(records, columns=derived_columns)

    # The records are in row order, so attach them by position: unlike a merge on the property id
    # this cannot multiply rows when an id is repeated
    self.standardised_asset_list = pd.concat([assets, processed_age_band], axis=1)
self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( + ["solar pv on roof"] + ) + ) + else: + # We don't have an indication + existing_solar_non_intrusives_check = False + + self.standardised_asset_list["property_has_solar"] = ( + (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") | + existing_solar_non_intrusives_check | + (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR]) + ) + + # If we have non-intrusives completed, we can use this to identify work types + ###################################################### + # Empty cavity: + ###################################################### + # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled + # 2) The age is before 1995 + # 3) We don't remove anything that haas access issues yet + + if self.non_intrusives_present: + non_intrusives_wall_filter = ( + (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") & + self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) + ) + elif self.old_format_non_intrusives_present: + non_intrusives_wall_filter = ( + self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( + ["empty cavity", "partial fill"] + ) | ( + ( + self.standardised_asset_list['non-intrusives: WFT Findings'] + .str.lower().str.strip().str.contains("empty cavity|partial fill") & + ~self.standardised_asset_list['non-intrusives: WFT Findings'] + .astype(str).str.lower().str.strip().str.contains("major access issues") + ) + ) + ) + else: + # We set the filter to False, as we have no non-intrusives + non_intrusives_wall_filter = False + + if self.landlord_year_built is None: + year_built_filter = self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD + else: + year_built_filter = ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) | + 
(self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) + ) + + # Criteria: + # The property isn't a bedsit + # Non-intrusives indicate it needs a fill + # The EPC year is before 2002 + # We also flag where the property has solar on the roof, because this is a signal of a high EPC rating + self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = ( + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + year_built_filter & + ( + ~self.standardised_asset_list["property_has_solar"] + ) + ) + + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = ( + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter & + year_built_filter & + ( + # If the property has solar, there's a chance it won't qualify + self.standardised_asset_list["property_has_solar"] + ) + ) + + # We also add a filter on anything that was generally identified by the non-intrusives + self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_year_filter"] = ( + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) & + pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"]) & + (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) & + non_intrusives_wall_filter + ) + + self.standardised_asset_list["epc_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) & ( + self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD + ) & ( + ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) + ) + ) + + 
self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) & + ( + (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) | + (self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) + ) & ( + ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"]) + ) + ) + + # Finally, we create a flag to indicate that the cavity is empty, based on the criteria above + self.standardised_asset_list["cavity_is_empty"] = ( + non_intrusives_wall_filter | + self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin( + self.EPC_NO_WALL_INSULATION_DESCRIPTIONS + ) | + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) + ) + + ###################################################### + # Extraction + ###################################################### + # as needing a CIGA check. What is the logic we should be applying here? 
+ + if self.non_intrusives_present: + + extraction_wall_filter = ( + (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") & + (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) & + (~self.standardised_asset_list['non-intrusives: Material'].isin( + ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"] + )) + ) + + if self.non_intrusives_eligibility: + # If we have the eligibility column, we check if the wall is eligible + extraction_wall_filter = ( + extraction_wall_filter & + ~self.standardised_asset_list["non-intrusives: Eligibility (Red/Yellow/Green)"].isin( + ["RED"] + ) + ) + + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + extraction_wall_filter & year_built_filter + ) + + elif self.old_format_non_intrusives_present: + print("Review these categories!!!!") + extraction_wall_filter = ( + self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin( + ["retro drilled", "retro filled", "fibre from build", "polybead", "retro drilled and filled", + "retro drilled & filled", "blown in white wool", "blown in yellow wool"] + ) + ) + + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + extraction_wall_filter + ) + + else: + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = False + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = False + + ###################################################### + # Solar + ###################################################### + # Criteria: + # Check 1: Does the property have a valid heating system? 
+ self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + [ + "air source heat pump", + "ground source heat pump", + "high heat retention storage heaters", + "electric boiler" + ] + ) + ) + self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin( + ["electric storage heaters", "room heaters", "electric radiators", "no heating"] + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().str.contains("air source heat pump|ground source heat pump|boiler and radiators, electric") + ) | ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters" + ) & ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES[ + "mainheatcont-description"]] == "Controls for high heat retention storage heaters" + ) + ) + ) + + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains( + "electric storage heaters|room heaters" + ) & ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["mainheatcont-description"] + ] != "Controls for high heat retention storage heaters" + ) + ) + + # Basic check - both of the previous two shouldn't be true simultaneously + if ( + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] & + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] + ).sum(): + raise ValueError("Both heating system checks are true - this should not be possible") + + # Check 3: Does the property meet the fabric condition + # Solar PV installs are subject to the minimum insulation 
requirements which means: + # 1) one of the following insulation measures must be installed as part of the same + # ECO4 project: + # • roof insulation (flat roof, pitched roof, room-in-roof) + # • exterior facing wall insulation (cavity wall, solid wall) + # • party cavity wall insulation + # • floor insulation (solid and underfloor) + # + # OR + # + # all measures (except any exempted measure referred to in paragraph 4.28) + # listed in paragraph a) must already be installed + # + # With this in mind, we look for 2 clases + # 1) The property is fully insulated apart from the loft (<200mm insulation) + # 2) THe property is fully insulated + + print("Should we include cavity properties where they might be uninsulated?") + self.standardised_asset_list["solar_landlord_walls_insulated"] = ( + self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin( + [ + "filled cavity", "insulated solid brick", "insulated timber frame", + ] + ) + ) + + if self.non_intrusives_present: + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: Insulated"].isin( + ["EWI", "RETRO DRILLED", "FILLED AT BUILD"] + ) + ) + elif self.old_format_non_intrusives_present: + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = ( + self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin( + ["retro drilled", "retro filled", "ewi", "retro drilled/ solid"] + ) + ) + else: + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = False + + # We merge on the u-value for average thermal transmittance + walls_uvalue_data = pd.DataFrame(cleaned["walls-description"]) + walls_uvalue_data = walls_uvalue_data[ + ~pd.isnull(walls_uvalue_data["thermal_transmittance"]) + ][["original_description", "thermal_transmittance"]].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["walls-description"], + "thermal_transmittance": "walls_u_value" + } + ) + 
self.standardised_asset_list = self.standardised_asset_list.merge( + walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"] + ) + + self.standardised_asset_list["solar_epc_walls_insulated"] = ( + ( + self.standardised_asset_list[ + self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS) + ) + ) | ( + self.standardised_asset_list["walls_u_value"].apply(lambda x: x <= 0.7 if not pd.isnull(x) else False) + ) + ) + + # We merge on the u-value for average thermal transmittance + roof_roof_data = pd.DataFrame(cleaned["roof-description"])[ + ["original_description", "thermal_transmittance", "is_pitched", "is_loft"] + ].rename( + columns={ + "original_description": self.EPC_API_DATA_NAMES["roof-description"], + "thermal_transmittance": "roof_u_value", + } + ) + + self.standardised_asset_list = self.standardised_asset_list.merge( + roof_roof_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"] + ) + + # If the u-value of a roof is less than 0.7 we consider it insulated + self.standardised_asset_list["solar_epc_roof_insulated"] = ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains( + "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), + ) | ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) >= 200 if str(x).isdigit() else False + ) + ) | ( + self.standardised_asset_list["roof_u_value"].apply( + lambda x: x <= 0.7 if not pd.isnull(x) else False + ) + ) + ) + + self.standardised_asset_list["solar_epc_loft_needs_topup"] = ( + self.standardised_asset_list[ + self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply( + lambda x: int(x) < 200 if str(x).isdigit() else False + ) | ( + ( + self.standardised_asset_list["is_loft"] | self.standardised_asset_list["is_pitched"] + ) & ( + self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].isin( + ["below average", "none"] + ) 
+ ) + ) + ) + + self.standardised_asset_list["epc_has_floor_recommendation"] = ( + self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False) + ) + + # Check if the boiler is electric + # We check if it contains both the terms boiler & electric + self.standardised_asset_list["has_electric_boiler"] = ( + ( + self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]] + .str.lower().isin( + ["boiler and radiators, electric"]) + ) | ( + self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler" + ) + ) + + #################################### + # Check solar eligibility + #################################### + + # Set up the filters to stop repetition + correct_heating_system = ( + self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] | + self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] | + self.standardised_asset_list["has_electric_boiler"] + ) + + needs_heating_upgrade = ( + self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] | + self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] + ) + + # The requirements for walls are: + # 1) walls are insulated + # 2) property is a cavity (can be done insulated or not) + + walls_meet_solar_requirements = ( + # The landlord is saying the walls are insulated + self.standardised_asset_list["solar_landlord_walls_insulated"] | + # EPC data is saying the walls are insulated + self.standardised_asset_list["solar_epc_walls_insulated"] | + # Non-intrusives are saying the walls are insulated + self.standardised_asset_list["solar_non_intrusives_walls_insulated"] | + # It's empty cavity + self.standardised_asset_list["cavity_is_empty"] | + # It's a cavity wall + (self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].str.contains("cavity")) + ) + + not_a_flat = ( + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "flat" + ) + + 
solar_roof_meets_criteria = ( + self.standardised_asset_list["solar_epc_roof_insulated"] | + self.standardised_asset_list["solar_epc_loft_needs_topup"] + ) + + self.standardised_asset_list["solar_eligible"] = ( + # Property isn't a flag + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_meet_solar_requirements & + # Roof meets criteria + solar_roof_meets_criteria + ) + + # With heating upgrade + self.standardised_asset_list["solar_eligible_needs_heating_upgrade"] = ( + not_a_flat & + # Needs heating upgrade + needs_heating_upgrade & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are insulated + walls_meet_solar_requirements & + # Roof meets criteria + solar_roof_meets_criteria + ) + + # We shouldn't have an overlap + if ( + self.standardised_asset_list["solar_eligible"] & + self.standardised_asset_list["solar_eligible_needs_heating_upgrade"] + ).sum(): + raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible") + + # We check for a specific sub-set of properties which are uninsulated solid wall properties that are EPC E + # or below (we'll use 57 as a threshold) - These are for a pilot with Net Zero Renewables + self.standardised_asset_list["solar_eligible_solid_wall_uninsulated"] = ( + not_a_flat & + # Landlord data or EPC data indicates the heating system is appropriate - in this case, we can also take + # electric boilers + correct_heating_system & + # The property doesn't currently have solar + ~self.standardised_asset_list["property_has_solar"] & + # The walls are uninsulated solid + ~walls_meet_solar_requirements & + (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 57) + ) + + # Drop anything we don't need + 
self.standardised_asset_list = self.standardised_asset_list.drop( + columns=["walls_u_value", "roof_u_value"] + ) + + # Adjust flagged extraction jobs to remove anything for solar + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + ~self.standardised_asset_list["solar_eligible"] + ) + + # Finally, we note why each property has been flagged + self.standardised_asset_list["cavity_reason"] = None + + empty_cavity_map = { + "non_intrusive_indicates_empty_cavity": "Non-Intrusive Data Shows Empty Cavity: ", + "non_intrusive_indicates_empty_cavity_has_solar": "Non-Intrusive Data Shows Empty Cavity - property " + "already has solar: ", + "non_intrusive_indicates_empty_cavity_no_year_filter": f"Non-Intrusive Data Shows Empty Cavity, " + f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ", + + } + for variable, description in empty_cavity_map.items(): + self.standardised_asset_list["cavity_reason"] = np.where( + self.standardised_asset_list[variable] & + pd.isnull(self.standardised_asset_list["cavity_reason"]), + description + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + # We break the cavity reason into a few different categories, when the EPC is different from inspections + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + (self.standardised_asset_list['non-intrusives: Insulated'] == "RETRO DRILLED") & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "EPC Shows Empty Cavity, inspections show retro drilled: " + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + 
~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + (self.standardised_asset_list['non-intrusives: Insulated'] == "FILLED AT BUILD") & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "EPC Shows Empty Cavity, inspections show filled at build: " + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["epc_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "EPC Shows Empty Cavity, inspections show non-cavity build: " + self.standardised_asset_list[ + "SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + # Landlord data: The landlord's data indicates that the wall is an uninsulated cavity wall, but EPC and + # inspections show filled + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["landlord_data_indicates_empty_cavity"] & + ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] & + ~self.standardised_asset_list["epc_indicates_empty_cavity"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Landlord Data Shows Empty Cavity, EPC & Inspections Shows Filled: " + self.standardised_asset_list[ + "SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + # Flag extraction + self.standardised_asset_list["cavity_reason"] = np.where( + ( + self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] & + pd.isnull(self.standardised_asset_list["cavity_reason"]) + ), + "Non-Intrusive Data Shows Cavity Extraction: " + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["cavity_reason"] + ) + + ###################################################### + # Flag solar + ###################################################### + 
self.standardised_asset_list["solar_reason"] = None + + # Map of variables and fill values for the solar_reason variable + solar_reason_map = { + "solar_eligible": "Solar Eligible: ", + "solar_eligible_needs_heating_upgrade": ( + "Solar Eligible, Solid Floor, Needs Heating Upgrade: " + ), + "solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below: ", + } + + for variable, reason in solar_reason_map.items(): + self.standardised_asset_list["solar_reason"] = np.where( + self.standardised_asset_list[variable], + reason + self.standardised_asset_list["SAP Category"], + self.standardised_asset_list["solar_reason"] + ) + + # Flag anything that has existing outcomes + if (self.outcomes is not None) and ("Surveyed" in self.standardised_asset_list.columns): + + if "Installer Refusal" not in self.standardised_asset_list.columns: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (self.standardised_asset_list["Surveyed"] > 0) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + else: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (self.standardised_asset_list["Surveyed"] > 0) | + (self.standardised_asset_list["Installer Refusal"] > 0) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + + if self.master_surveyed is not None: + self.standardised_asset_list["cavity_reason"] = np.where( + ( + (~pd.isnull(self.standardised_asset_list["submission_date"])) + ), + None, + self.standardised_asset_list["cavity_reason"] + ) + + blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats" + ] + + non_blocks_of_flats = self.standardised_asset_list[ + self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats" + ] + + # Produce some aggregate figures + self.work_type_figures = { + **non_blocks_of_flats["cavity_reason"].value_counts().to_dict(), + **{ + k + " (Block of flats)": v for k, v in + 
@staticmethod
def split_full_name(x):
    """
    Split a contact's full name into (title, first name, last name).

    Only the leading word is considered as a title, and it must equal one of the
    recognised honorifics exactly (a trailing "." is tolerated). This avoids treating
    "Mrs" as "Mr", and avoids stripping title-like fragments from inside real names
    (e.g. the "dr" in "Andrew" or the "mr" in "Mrinal").

    :param x: Full name such as "Mrs Jane Smith". Null values are allowed.
    :return: Tuple (title, first_name, last_name), each title-cased. The title is None
        when no honorific is present and all three are None for a null input. A
        single-word name is returned as both the first and the last name.
    """
    if pd.isnull(x):
        return None, None, None

    titles = {"mr", "mrs", "ms", "miss", "dr", "prof"}
    words = str(x).lower().split()

    title = None
    # Compare the whole first word, not a prefix, so "mrs" never matches "mr"
    if words and words[0].rstrip(".") in titles:
        title = words.pop(0).rstrip(".").title()

    first_name = words[0] if words else ""
    last_name = words[-1] if words else ""
    return title, first_name.title(), last_name.title()
we reference them, to the product data as stored in Hubspot + product_lookup_table = { + "Non-Intrusive Data Showed Cavity Extraction": { + "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500 + }, + "Non-Intrusive Data Showed Empty Cavity": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed": { + "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500 + }, + "EPC Data Showed Empty Cavity": { + "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000 + }, + "Solid Floor, Insulated, No Solar": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Solid Floor, Insulated, Needs Loft": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Other Floor, Insulated, No Solar": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + }, + "Other Floor, Insulated, Needs Loft": { + "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608 + } + } + # We check if all products are covered in the lookup table + cavity_products = self.standardised_asset_list["cavity_reason"].unique() + solar_products = self.standardised_asset_list["solar_reason"].unique() + # Check if there any options not in out lookup table + if ( + any(x for x in cavity_products if x not in product_lookup_table) or + any(x for x in solar_products if x not in product_lookup_table) + ): + raise ValueError("We have products not referenced in the lookup table - check this") + + programme_data = self.standardised_asset_list.copy() + + # Exclusions - these are properties we won't treat for the moment + product_exclusions = [ + "Other Floor, Insulated, No Solar", + "Other Floor, Insulated, Needs Loft" + ] + if product_exclusions: + 
logger.warning("Excluding products: %s", product_exclusions) + + programme_data = programme_data[programme_data["solar_reason"].isin(product_exclusions) == False] + + # Merge on the contact details + programme_data = programme_data.merge( + self.contact_details, + how="left", + left_on=self.STANDARD_LANDLORD_PROPERTY_ID, + right_on=self.landlord_property_id, + ) + + programme_data["Company Domain Name "] = company_domain + # Append the product data onto the programme data + programme_data["cavity_product"] = programme_data["cavity_reason"].map( + lambda x: product_lookup_table.get(x, {"name": None})["name"] + ) + programme_data["solar_product"] = programme_data["solar_reason"].map( + lambda x: product_lookup_table.get(x, {"name": None})["name"] + ) + + programme_data["domna_product"] = programme_data["solar_reason"].copy() + programme_data["domna_product"] = np.where( + pd.isnull(programme_data["domna_product"]), + programme_data["solar_product"], + programme_data["domna_product"] + ) + # We filter just on rows where we have a product + programme_data = programme_data[ + ~pd.isnull(programme_data["domna_product"]) + ] + programme_data = programme_data.drop(columns=["solar_product", "cavity_product"]) + + product_df = ( + pd.DataFrame(product_lookup_table).T[["name", "id", "unit_price"]] + .reset_index() + .rename( + columns={ + "name": "Name ", + "id": 'Product ID ', + "unit_price": 'Unit price ', + "index": "domna_product" + } + ) + ) + + product_df['Quantity '] = 1 + + # Append on the product data + programme_data = programme_data.merge( + product_df, + how="left", + on="domna_product", + ) + + # Add in deal and pipeline information + programme_data["dealname"] = programme_data[self.STANDARD_FULL_ADDRESS] + " : " + programme_data[ + "domna_product"] + programme_data['Pipeline '] = crm_pipeline_name + programme_data['Deal Stage '] = first_dealstage + programme_data['Associations: Listing'] = "Property Owner" + + programme_data = programme_data.merge( + 
assigned_surveyors.rename( + columns={self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID} + ), how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID + ) + + # This maps the hubspot schema to the template. Anything that is not covered in this will be flagged + schema_mappings = { + 'Name ': self.DOMNA_PROPERTY_ID, # TODO: Maybe change this? + 'Company Domain Name ': 'Company Domain Name ', + 'Email ': ( + self.contact_detail_fields["email"] if self.contact_detail_fields["email"] else None + ), # TODO: Review + 'First Name ': ( + self.contact_detail_fields["firstname"] if self.contact_detail_fields["firstname"] else None + ), # TODO: Review + 'Last Name ': ( + self.contact_detail_fields["lastname"] if self.contact_detail_fields["lastname"] else None + ), # TODO: Review + 'Phone ': ( + self.contact_detail_fields["phone_number"] if self.contact_detail_fields["phone_number"] else None + ), # TODO: Review + 'Full Address ': self.STANDARD_FULL_ADDRESS, + 'Address 1 ': self.STANDARD_ADDRESS_1, + 'Address 2 ': None, # TODO: Don't have this for the moment + 'Postcode ': self.STANDARD_POSTCODE, + 'Property Type ': self.STANDARD_PROPERTY_TYPE, + 'Property Sub Type ': None, # TODO: Don't have this for the moment + 'Bedroom(s) ': None, # TODO: Don't have this for the moment + 'Domna Property ID ': self.DOMNA_PROPERTY_ID, + 'National UPRN ': ( + self.STANDARD_UPRN if self.STANDARD_UPRN is not None else self.EPC_API_DATA_NAMES["uprn"] + ), + 'Owner Property ID ': self.STANDARD_LANDLORD_PROPERTY_ID, + 'Wall Construction ': self.STANDARD_WALL_CONSTRUCTION, + 'Heating System ': self.STANDARD_HEATING_SYSTEM, + 'Year Built ': self.STANDARD_YEAR_BUILT, + 'Boiler Make ': None, # TODO: Don't have this for the moment + 'Boiler Model ': None, # TODO: Don't have this for the moment + 'Non-Intrusives: Date Checked ': None, + # TODO: Don't have this for the moment + 'Non-Intrusives: Wall Type ': ( + "non-intrusives: Construction" if self.non_intrusives_present else None + ), + 
'Non-intrusives: Insulation ': ( + "non-intrusives: Insulated" if self.non_intrusives_present else None + ), + 'Non-intrusives: Insulation Material ': ( + "non-intrusives: Material" if self.non_intrusives_present else None + ), + 'Non-Intrusives: CIGA Check Required ': ( + 'non-intrusives: CIGA Check Required' if self.non_intrusives_present else None + ), + 'Non-Intrusives: PV Access Issues ': ( + 'non-intrusives: PV, ACCESS ISSUE, SEE NOTES' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Roof Orientation ': ( + 'non-intrusives: OFF GAS - ROOF ORIENTATION' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Surveyor Notes ': ( + 'non-intrusives: Any further surveyor notes' if self.non_intrusives_present else None + ), + 'Non-Intrusives: Surveyor Name ': ( + 'non-intrusives: Surveyors Name' if self.non_intrusives_present else None + ), + 'CIGA: Date Requested ': None, # TODO: Don't have this for the moment + 'CIGA: Cavity Guarantee Found ': None, + 'Last EPC: Is Estimated ': self.EPC_API_DATA_NAMES["estimated"], + 'Last EPC: EPC Rating ': self.EPC_API_DATA_NAMES["current-energy-rating"], + 'Last EPC: SAP Rating ': self.EPC_API_DATA_NAMES["current-energy-efficiency"], + 'Last EPC: Main Heating Description ': self.EPC_API_DATA_NAMES[ + "mainheat-description"], + 'Last EPC: Heating Controls ': self.EPC_API_DATA_NAMES[ + "mainheatcont-description"], + 'Last EPC: Lodgement Date ': self.EPC_API_DATA_NAMES["inspection-date"], + 'Last EPC: Floor Area ': self.EPC_API_DATA_NAMES["total-floor-area"], + 'Last EPC: Wall ': self.EPC_API_DATA_NAMES["walls-description"], + 'Last EPC: Roof ': self.EPC_API_DATA_NAMES["roof-description"], + 'Last EPC: Floor ': self.EPC_API_DATA_NAMES["floor-description"], + 'Last EPC: Room Height ': self.EPC_API_DATA_NAMES["floor-height"], + 'Last EPC: Age Band ': self.EPC_API_DATA_NAMES["construction-age-band"], + 'Deal Stage ': 'Deal Stage ', + 'Pipeline ': 'Pipeline ', + 'Expected Commencement Date ': None, # TODO: 
def flag_outcomes(
        self,
        outcomes_filepath,
        outcomes_sheetname,
        outcomes_address,
        outcomes_postcode,
        outcomes_houseno,
        outcomes_id
):
    """
    Load a sheet of survey outcomes, match each row to the asset list and merge per-property
    outcome counts onto the asset list.

    Each outcome row is matched using the first strategy that finds a property:
      1. landlord property id (only when ``outcomes_id`` is given) - must match exactly one row
      2. normalised full address - must match exactly one row
      3. postcode plus house number - exactly one row, otherwise
      4. the fuzzy best match on full address among the postcode + house number candidates

    Sets ``self.outcomes`` and ``self.outcomes_no_match``. When at least one row is matched it also
    adds one count column per outcome value plus ``visit_count`` to ``self.standardised_asset_list``.

    :param outcomes_filepath: Path of the outcomes workbook. Nothing happens when this is None.
    :param outcomes_sheetname: Sheet to read from the workbook.
    :param outcomes_address: Column holding the address of the visited property.
    :param outcomes_postcode: Column holding the postcode.
    :param outcomes_houseno: Column holding the house number. When None, a "houseno" column is
        derived from the address and postcode.
    :param outcomes_id: Column holding the landlord's property id, or None if the sheet has none.
    :raises ValueError: If the pivoted outcomes contain a property id more than once.
    """
    if outcomes_filepath is None:
        return

    pid_col = self.DOMNA_PROPERTY_ID

    def normalise_address(text):
        # Lower-case, drop commas and collapse repeated whitespace so formatting differences don't
        # prevent an exact address match
        return re.sub(r"\s+", " ", text.lower().replace(",", ""))

    def lookup_row(outcome_row, asset_rows):
        # We take the first asset row - callers guarantee there is at least one
        return {"row_id": outcome_row["row_id"], pid_col: asset_rows[pid_col].values[0]}

    self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
    self.outcomes["row_id"] = self.outcomes.index

    if outcomes_houseno is None:
        outcomes_houseno = "houseno"
        # Pair every address with its own postcode (rather than handing over the whole postcode column)
        self.outcomes["houseno"] = [
            SearchEpc.get_house_number(str(address), str(postcode))
            for address, postcode in zip(
                self.outcomes[outcomes_address], self.outcomes[outcomes_postcode]
            )
        ]

    logger.info("Matching outcomes to asset list")
    # Merge the outcomes onto the asset list - we check we're able to match sufficiently well
    lookup = []
    nomatch = []
    # Built the first time a row falls through to address matching, then re-used for every row
    asset_addresses = None
    for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)):

        if pd.isnull(x[outcomes_address]):
            continue

        # Check if we have an id
        oid = x[outcomes_id] if outcomes_id is not None else None

        if oid is not None:
            matched = self.standardised_asset_list[
                self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].str.strip() == oid
            ]

            if matched.shape[0] == 1:
                lookup.append(lookup_row(x, matched))
                continue

        if asset_addresses is None:
            # Outcome labels are lower-cased at this point (and only once) so that the timing
            # matches the previous behaviour, where this happened on the first address match attempt
            self.outcomes["Outcome"] = self.outcomes["Outcome"].str.lower()
            asset_addresses = (
                self.standardised_asset_list[self.STANDARD_FULL_ADDRESS]
                .str.lower()
                .str.replace(",", "", regex=False)
                .str.replace(r"\s+", " ", regex=True)
            )

        matched = self.standardised_asset_list[
            asset_addresses == normalise_address(x[outcomes_address])
        ]

        if matched.shape[0] == 1:
            lookup.append(lookup_row(x, matched))
            continue

        postcode = x[outcomes_postcode]
        if isinstance(postcode, str):
            postcode = postcode.strip()

        matched = self.standardised_asset_list[
            self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == postcode
        ].copy()
        if not matched.empty:
            matched["houseno"] = matched.apply(
                lambda asset: SearchEpc.get_house_number(
                    str(asset[self.STANDARD_ADDRESS_1]), str(asset[self.STANDARD_POSTCODE])
                ),
                axis=1
            )

            matched = matched[
                matched["houseno"].astype(str) == str(x[outcomes_houseno])
            ]
            if matched.shape[0] == 1:
                lookup.append(lookup_row(x, matched))
                continue
            elif not matched.empty:
                # Several candidates share the postcode and house number - take the closest
                # full address by fuzzy match
                best_match = process.extractOne(
                    x[outcomes_address], matched[self.STANDARD_FULL_ADDRESS].values
                )[0]
                matched = matched[matched[self.STANDARD_FULL_ADDRESS] == best_match]
                lookup.append(lookup_row(x, matched))
                continue

        nomatch.append(x["row_id"])

    self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)]
    lookup = pd.DataFrame(lookup)

    if lookup.empty:
        return

    # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times
    # Where we have multiple rows, we want to make a call on what the action should be. For example,
    # there may be properties that have been visited multiple times where the outcome was "See notes" implying
    # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has
    # happened multiple times, in this case we judge that the work may not be viable

    date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date"

    lookup = lookup.merge(
        self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id"
    )

    visit_counts = (
        lookup.groupby(pid_col)["row_id"]
        .count()
        .reset_index()
        .rename(columns={"row_id": "visit_count"})
        .sort_values("visit_count", ascending=False)
    )

    # One row per property, one column per outcome holding the number of visits with that outcome
    pivot_df = lookup.groupby([pid_col, "Outcome"]).size().unstack(fill_value=0).reset_index()
    pivot_df = pivot_df.merge(visit_counts, how="left", on=pid_col)

    if pivot_df[pid_col].duplicated().sum():
        raise ValueError("We have duplicated property IDs in the outcomes data")

    # We merge this data onto outcomes
    self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values)
    self.outcomes = self.outcomes.merge(lookup[["row_id", pid_col]], how="left", on="row_id")

    # We merge our pivoted outcomes onto the asset list
    self.standardised_asset_list = self.standardised_asset_list.merge(
        pivot_df, how="left", left_on=pid_col, right_on=pid_col
    )

    self.outcomes = self.outcomes.sort_values(pid_col, ascending=False)
logger.info("Getting masters and merging onto asset list") + master_surveyed = [] + unmatched_submissions = [] + for filepath in master_filepaths: + master_data = pd.read_csv(filepath) + # Strip columns + master_data.columns = [c.strip() for c in master_data.columns] + + if not id_map.empty: + master_data = master_data.merge( + id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code'] + ) + + install_col = ( + "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns + else "INSTALL / CANCELLATION DATE" + ) + + submission_col = ( + "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS" + ) + + if "UPRN" in master_data.columns: + # We just need to check if any were cancelled + master_to_append = master_data[ + ["UPRN", install_col, submission_col] + ].rename( + columns={ + "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID, + install_col: "survey_status", + submission_col: "submission_date" + } + ) + master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") + + master_surveyed.append(master_to_append) + continue + + master_data["row_id"] = master_data.index + + self.standardised_asset_list["house_no"] = self.standardised_asset_list.apply( + lambda x: SearchEpc.get_house_number( + str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE]) + ), + axis=1 + ) + + postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code" + house_no_col = 'NO.' if 'NO.' 
in master_data.columns else "NO" + + # Otherwise, we need to match algorithmically + logger.info("Matching master data to asset list") + matched = [] + unmatched = [] + for _, row in tqdm(master_data.iterrows(), total=len(master_data)): + if pd.isnull(row[postcode_col]): + continue + postcode_no_space = row[postcode_col].strip().replace(" ", "").lower() + + df = self.standardised_asset_list[ + ( + self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ", + "") + == postcode_no_space + ) + ] + + house_no = row[house_no_col] + + if house_no in df["house_no"].values: + df = df[df["house_no"] == house_no] + if df.shape[0] != 1: + # Levenstein distance + + if any(df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])): + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"]) + ] + else: + # Levenstein distance + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.lower().apply( + lambda x: process.extractOne( + " ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(), + x + )[1] + ) > 90 + ] + + if df.shape[0] == 0: + unmatched.append(row["row_id"]) + continue + + if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() + )): + df = df[ + df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains( + " ".join([row[house_no_col], row["Street / Block Name"]]).lower() + ) + ] + + if any( + df[self.STANDARD_PROPERTY_TYPE].str.contains( + row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() + ) + ): + # We ignore "block of flats" entries + df = df[ + df[self.STANDARD_PROPERTY_TYPE].str.contains( + row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower() + ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats") + ] + + if df.shape[0] != 1: + # We have multiple matches + raise NotImplementedError("FIX ME") + matched.append( + { + "row_id": row["row_id"], + 
self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0], + } + ) + + self.standardised_asset_list = self.standardised_asset_list.drop(columns="house_no") + + # We match the "UPRN" which is the landlords ID, onto the master sheet + matched = pd.DataFrame(matched) + master_to_append = master_data[["row_id", install_col, submission_col]].merge( + matched, how="left", on="row_id" + ).rename( + columns={ + install_col: "survey_status", + submission_col: "submission_date" + } + ) + master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel") + master_surveyed.append(master_to_append) + unmatched_df = master_data[ + master_data["row_id"].isin(unmatched) + ] + + scheme_col = ( + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if + "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in master_data.columns else "AFFORDABLE WARMTH" + ) + # The columns are massively different - we take just a few + unmatched_df = unmatched_df[ + [ + scheme_col, house_no_col, "Street / Block Name", postcode_col, install_col, submission_col + ] + ].rename( + columns={ + scheme_col: "Funding Scheme", + house_no_col: "House Number", + postcode_col: "Postcode", + install_col: "survey_status", + submission_col: "submission_date" + } + ) + + unmatched_submissions.append(unmatched_df) + + master_surveyed = pd.concat(master_surveyed) + master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])] + master_surveyed = master_surveyed[ + ~master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID].isin( + ["NOT ON ASSET LIST", "Missing From Asset List"] + ) + ] + + master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[ + self.STANDARD_LANDLORD_PROPERTY_ID + ].astype(str) + + # We de-dupe crudely on landlord property id + self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID]) + + self.standardised_asset_list = self.standardised_asset_list.merge( + 
class DataRemapper:
    """
    Maps free-text landlord values onto a fixed set of standard values.

    Values are resolved in order of cost: the predefined mapping, an exact match against the
    standard values, a fuzzy match, and finally a single batched OpenAI request for whatever
    is left. Token usage and an estimated cost are tracked across calls.
    """

    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
        """
        Initialize the remapper with standard values and a predefined mapping.

        :param standard_values: Set of allowed standardized values.
        :param standard_map: Dictionary of common remappings {raw_value: standard_value}.
            Defaults to an empty mapping.
        :param max_tokens: Upper bound on the prompt size, also passed to the API as the
            completion limit.
        """
        self.standard_values = standard_values
        # Normalise a missing map to an empty dict so membership checks never hit None
        self.standard_map = standard_map if standard_map is not None else {}
        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing

        # Tokenizer for counting tokens
        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)

        # Track token usage and remap dictionary
        self.total_tokens_used = 0
        self.total_cost = 0
        self.remap_dict = {}  # {original_value: standardized_value}
        self.max_tokens = max_tokens  # Limit for OpenAI API

        # Memoization for AI calls
        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
        # Capture the response for debugging
        self.ai_response = None

        # OpenAI pricing (as of Feb 2024)
        self.pricing = {
            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
        }

        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)

    @staticmethod
    def clean_string(text):
        """
        Basic text cleaning: remove extra spaces, punctuation, and normalize case.

        :param text: Raw value.
        :return: The cleaned string, or None when ``text`` is not a string.
        """
        if not isinstance(text, str):
            return None
        text = text.strip().lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        # Collapse repeated whitespace
        text = re.sub(r'\s+', ' ', text)
        return text

    def fuzzy_match(self, text):
        """
        Use fuzzy matching to find the closest standard value.

        :param text: Cleaned value to match.
        :return: The best standard value when its score reaches ``fuzzy_threshold``, else None.
        """
        if not text or not self.standard_values:
            return None
        result = process.extractOne(text, self.standard_values)
        if result is None:
            return None
        # Index rather than unpack: the result carries a third element for dict-like choices
        match, score = result[0], result[1]
        return match if score >= self.fuzzy_threshold else None

    def count_tokens(self, text):
        """Estimate the number of tokens in a given text."""
        return len(self.tokenizer.encode(text)) if text else 0

    @staticmethod
    def _parse_mapping(output_text, unmapped_values):
        """
        Turn the model's reply into a {original_value: standardized_value} dictionary.

        The reply is parsed as JSON first and as a Python literal second; it is never
        executed. Anything that does not parse to a dictionary maps every value to "unknown".

        :param output_text: Raw text returned by the model.
        :param unmapped_values: The values that were sent for standardization.
        :return: Dictionary {original_value: standardized_value}.
        """
        import ast
        import json

        try:
            mapping = json.loads(output_text)
        except ValueError:
            try:
                # Tolerates a Python-style dict (single quotes) without evaluating code
                mapping = ast.literal_eval(output_text)
            except (ValueError, SyntaxError, TypeError):
                mapping = None

        if not isinstance(mapping, dict):
            mapping = {val: "unknown" for val in unmapped_values}  # Fallback
        return mapping

    def ai_standardize(self, unmapped_values):
        """
        Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.

        :param unmapped_values: Values that could not be resolved by rules or fuzzy matching.
        :return: Dictionary {original_value: standardized_value}; empty when there is nothing to map.
        :raises ValueError: If the prompt is longer than ``max_tokens``.
        """
        if not unmapped_values:
            return {}

        unmapped_tuple = tuple(sorted(unmapped_values))  # Ensure consistency for memoization
        if unmapped_tuple in self.ai_cache:
            return self.ai_cache[unmapped_tuple]  # Return memoized result

        prompt = f"""
        You are an expert in data classification. Standardize each of these values into one of the categories:
        {list(self.standard_values)}.

        Return only a JSON dictionary where:
        - The keys are the original values.
        - The values are the standardized ones.

        Strictly return JSON **without markdown formatting** or extra text.

        Example Output:
        {{
            "BLKHOUS": "block house",
            "BEDSIT": "bedsit"
        }}

        Values to standardize:
        {unmapped_values}
        """

        # Count input tokens
        input_tokens = self.count_tokens(prompt)
        if input_tokens > self.max_tokens:
            raise ValueError("Input tokens exceed the maximum limit.")

        logger.info("Calling OpenAI API for standardization...")
        response = self.openai_client.chat.completions.create(
            model=self.ai_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=self.max_tokens,
            temperature=0.1,
        )

        output_text = response.choices[0].message.content.strip()
        output_tokens = self.count_tokens(output_text)  # Count output tokens
        # We store the raw AI response for debugging
        self.ai_response = output_text

        # Track total token usage
        self.total_tokens_used += input_tokens + output_tokens

        # Estimate cost
        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
        self.total_cost += input_cost + output_cost

        mapping = self._parse_mapping(output_text, unmapped_values)

        # Memoize the AI response
        self.ai_cache[unmapped_tuple] = mapping
        logger.debug("AI Response: %s", mapping)

        return mapping

    def standardize_list(self, values_to_remap):
        """
        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.

        :param values_to_remap: List of raw values to standardize.
        :return: Dictionary {original_value: standardized_value}. This is the remapper's running
            dictionary, so it also holds results from earlier calls.
        """
        standard_map = self.standard_map or {}
        unique_values = set(values_to_remap)  # Process only unique values

        unmapped_values = []
        for value in unique_values:
            if pd.isna(value):  # Handle NaN values
                self.remap_dict[value] = "unknown"
                continue

            cleaned_value = self.clean_string(value)

            # Rule-Based Check (Predefined Mapping) - the cleaned form takes priority over the raw one
            if cleaned_value in standard_map:
                self.remap_dict[value] = standard_map[cleaned_value]
                continue
            if value in standard_map:
                self.remap_dict[value] = standard_map[value]
                continue

            # Non-text values (e.g. numeric codes) can't be cleaned or matched on text
            if not isinstance(value, str):
                self.remap_dict[value] = "unknown"
                continue

            if value.lower() in standard_map:
                self.remap_dict[value] = standard_map[value.lower()]
                continue

            # Exact Match in Standard Values
            if cleaned_value in self.standard_values:
                self.remap_dict[value] = cleaned_value
                continue

            # Fuzzy Matching
            fuzzy_match = self.fuzzy_match(cleaned_value)
            if fuzzy_match:
                self.remap_dict[value] = fuzzy_match
                continue

            # Capture anything that wasn't mapped
            unmapped_values.append(value)

        # AI Model - remap anything unmapped (batch request)
        ai_mapping = self.ai_standardize(unmapped_values)
        self.remap_dict.update(ai_mapping)

        return self.remap_dict

    def report_usage(self):
        """Prints a summary of token usage and cost."""
        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
    """
    Derive a first address line from the full address and store it in "address1_extracted".

    The asset list is modified in place and also returned.

    :param asset_list: DataFrame holding the landlord's asset list.
    :param full_address_col: Column containing the full address.
    :param postcode_col: Column containing the postcode (only used for house number extraction).
    :param method: One of "first_two_words", "first_word" or "house_number_extraction".
    :return: The same DataFrame with the "address1_extracted" column added.
    :raises ValueError: If ``method`` is not one of the supported options.
    """
    target_col = "address1_extracted"

    # Reject unknown methods up front, before any column is touched
    if method not in ("first_two_words", "first_word", "house_number_extraction"):
        raise ValueError(f"Method {method} not recognized")

    if method == "house_number_extraction":
        asset_list[target_col] = asset_list.apply(
            lambda row: SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col]),
            axis=1
        )
        return asset_list

    # Both remaining methods work off the space-separated words of the address
    words = asset_list[full_address_col].str.split(" ")
    if method == "first_word":
        asset_list[target_col] = words.str[0]
    else:
        asset_list[target_col] = words.str[:2].str.join(" ")

    return asset_list
old + # - Look at year built vs age band + # + # For Solar: + # - Discount any that have solar PV - based on non-intrusives and from the inspections team + # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with + # electric room heaters but it might need to be an EPC E + # - Fabric - check the floor, wall and roof: + # - Filled or empty cavity is good + # - Insulated solid/timber/system built is good + # - SCIS/CEG needs solid floors + # - JJC don’t care + # - Anything with a loft 200 or below + # - Anything C75 and above won’t qualify + # - Insulated loft = 200mm + # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid) + # - Or the insulation required is loft/cavity (floors should be solid) + + # Bromford + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme " + "Rebuild/Prepared data/") + data_filename = "asset_list.xlsx" + sheet_name = "Sheet1" + postcode_column = 'PostCode' + fulladdress_column = "FullAddress" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "ConYear" + landlord_os_uprn = None + landlord_property_type = "AssetTypeDesc" + landlord_built_form = "PropTypeDesc" + landlord_wall_construction = "Construction type" + landlord_roof_construction = None + landlord_heating_system = "Heating Type" + landlord_existing_pv = None + landlord_property_id = "Asset" + landlord_sap = None + outcomes_filename = "outcomes.xlsx" + outcomes_sheetname = "Sheet1" + outcomes_postcode = "Postcode" + outcomes_houseno = "No" + outcomes_id = None + outcomes_address = "Address" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO " + "3 submissions.csv", + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme 
Rebuild/Prepared data/ECO " + "4 submissions.csv", + ] + master_to_asset_list_filepath = None + phase = False + + # Torus + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1" + data_filename = "Torus Property Asset List - Phase 1.xlsx" + sheet_name = "TORUS" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = "Property Age" + landlord_os_uprn = "NatUPRN" + landlord_property_type = "Property Type" + landlord_built_form = "Built Form" + landlord_wall_construction = "Wall Construction" + landlord_roof_construction = "Roof Construction" + landlord_heating_system = "Space Heating Source" + landlord_existing_pv = "Low Carbon Technology (Solar PV)" + landlord_property_id = "UPRN" + landlord_sap = "SAP Score" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + phase = True + + # Ealing - houses + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing" + data_filename = "Ealing_rechecked_cleaned_05042025.csv" + sheet_name = None + postcode_column = 'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Year Built" + landlord_os_uprn = None + landlord_property_type = "Property Type Code" + landlord_built_form = None + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Property ref" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + 
master_filepaths = [] + master_to_asset_list_filepath = None + + # Southern Midlands + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025" + data_filename = "Southern Housing Midlands Property List - combined.xlsx" + sheet_name = "Sheet 1" + postcode_column = 'Post Code' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Age_1" + landlord_os_uprn = None + landlord_property_type = "Prop_Type" + landlord_built_form = "Prop_Type" + landlord_wall_construction = "Walls_P" + landlord_heating_system = "Heating System" + landlord_existing_pv = None + landlord_property_id = "AssetID" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + outcomes_address = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # Live West (2018 Asset list) + data_folder = ( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/2018 Asset List" + ) + data_filename = "LIVEWEST STOCK - 23rd October 2018.xlsx" + sheet_name = "Assets" + postcode_column = 'Postcode' + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Year" + landlord_os_uprn = None + landlord_property_type = "Property Archetype" + landlord_built_form = None + landlord_wall_construction = None + landlord_heating_system = "Heating Fuel Type" + landlord_existing_pv = None + landlord_property_id = "Uprn - DO NOT DELETE" + outcomes_filename = "RT - LiveWest.xlsx" + outcomes_sheetname = "Feedback" + outcomes_postcode = "Poscode" + outcomes_houseno = "No." 
+ outcomes_id = "UPRN" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master " + "- redacted for analysis/CAVITY-Table 1.csv" + ] + master_to_asset_list_filepath = None + + # Live West (South West asset list) + data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March " + "2025/Livewest Asset List (Original) - csv") + data_filename = "Report-Table 1.csv" + sheet_name = None + postcode_column = 'Postcode' + fulladdress_column = "T1_Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build Yr" + landlord_os_uprn = None + landlord_property_type = "T1_AssetType" + landlord_built_form = "T1_AssetType" + landlord_wall_construction = "Wall Type Cavity" + landlord_heating_system = "Heating Fuel" + landlord_existing_pv = None + landlord_property_id = "T1_UPRN" + outcomes_filename = "RT - LiveWest.xlsx" + outcomes_sheetname = "Feedback" + outcomes_postcode = "Poscode" + outcomes_houseno = "No." 
+ outcomes_id = "UPRN" + master_filepaths = [ + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master " + "- redacted for analysis/CAVITY-Table 1.csv" + ] + master_to_asset_list_filepath = None + + # PFP London + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/London" + data_filename = "PFP AREAS SURROUNDING LONDON - JAY, RUTH & LANE.xlsx" + sheet_name = "PFP SURROUNDING LONDON" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # PFP North-West + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-West" + data_filename = "Places for People NORTH WEST - INSPECTIONS MASTER - UPDATE.xlsx" + sheet_name = "CHECKED" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + 
outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # PFP North-East + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-East" + data_filename = "Places for People NORTH EAST - INSPECTIONS MASTER.xlsx" + sheet_name = "CHECKED" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # PFP East + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East" + data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx" + sheet_name = "PFP EAST" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "AddressLine1" + address1_method = None + address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"] + missing_postcodes_method = None + landlord_year_built = None + landlord_os_uprn = None + landlord_property_type = "Archetype (PFP)" + landlord_built_form = "Archetype (PFP)" + landlord_wall_construction = None + landlord_heating_system = None + landlord_existing_pv = None + landlord_property_id = "Uprn" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + outcomes_id = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # Wates + data_folder = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - " + data_filename = "ECO 4 Wates.xlsx" + sheet_name = "Roadmap Homes" + postcode_column = 'Postcode' + fulladdress_column = None + address1_column = "Address Line 1" + address1_method = None + address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"] + missing_postcodes_method = None + landlord_year_built = "Build Year" + landlord_os_uprn = None + landlord_property_type = "Archetype" + landlord_built_form = "Archetype" + landlord_wall_construction = "Wall" + landlord_heating_system = "Heating Type" + landlord_existing_pv = None + landlord_property_id = "UPRN" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + master_filepaths = [] + master_to_asset_list_filepath = None + + # Ealing + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025" + # data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx" + # sheet_name = "IGNORE - FULL MAIN" + # postcode_column = 'Postcode' + # fulladdress_column = "Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Year Built" + # landlord_os_uprn = None + # landlord_property_type = "Property Type Code" + # landlord_wall_construction = None + # landlord_heating_system = None + # landlord_existing_pv = None + # landlord_property_id = "Property ref" + + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester" + # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx" + # sheet_name = "Sheet1" + # postcode_column = 'Full Address.1' + # fulladdress_column = "Full Address" + # address1_column = None + # address1_method = "first_word" + # address_cols_to_concat = [] + # missing_postcodes_method = None + # landlord_year_built = "Build Date" + # landlord_os_uprn = None + # 
landlord_property_type = "Property Type" + # landlord_wall_construction = "Wallinsul" + # landlord_heating_system = "HeatSorc" + # landlord_existing_pv = None + # landlord_property_id = "Property Reference" + # outcomes_filename = None + # outcomes_sheetname = None + # outcomes_postcode = None + # outcomes_houseno = None + # master_filepaths = [] + # master_to_asset_list_filepath = None + + # For Westward + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward" + data_filename = "WESTWARD - completed list - 20.03.2025.xlsx" + sheet_name = "Sheet1" + postcode_column = "WFT EDIT Postcode" + fulladdress_column = "Address" + address1_column = None + address1_method = "house_number_extraction" + address_cols_to_concat = [] + missing_postcodes_method = None + landlord_year_built = "Build date" + landlord_os_uprn = "UPRN" + landlord_property_type = "Location type" + landlord_built_form = None + landlord_wall_construction = "Wall Construction (EPC)" + landlord_heating_system = "Heat Source" + landlord_existing_pv = "PV (Y/N)" + landlord_property_id = "Place ref" + outcomes_filename = None + outcomes_sheetname = None + outcomes_postcode = None + outcomes_houseno = None + master_filepaths = [] + master_to_asset_list_filepath = None + outcomes_id = None + + # For ACIS - programme re-build + # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025" + # data_filename = "ACIS asset list.xlsx" + # sheet_name = "Assets" + # address1_column = "House No" + # postcode_column = "Postcode" + # landlord_property_id = "UPRN" + # fulladdress_column = None + # address_cols_to_concat = ["House No", "Street", "Town"] + # missing_postcodes_method = None + # address1_method = None + # landlord_year_built = "YEAR BUILT" + # landlord_os_uprn = None + # landlord_property_type = "Property type" + # landlord_built_form = None + # landlord_wall_construction = "Wall Constuction" + # landlord_heating_system = "Heating" 
+ # landlord_existing_pv = None + # outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx" + # outcomes_sheetname = "Feedback" + # outcomes_postcode = "Postcode" + # outcomes_houseno = "No" + # master_filepaths = [ + # os.path.join(data_folder, "ECO 3 -Table 1.csv"), + # os.path.join(data_folder, "ECO 4 -Table 1.csv"), + # ] + # master_to_asset_list_filepath = None + + # For plus dane + data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane" + data_filename = "PLUS DANE Asset List - for analysis.xlsx" + sheet_name = "Asset List" + address1_column = " Address" + postcode_column = " Postcode" + landlord_property_id = "UPRN" + fulladdress_column = " Address" + address_cols_to_concat = [] + missing_postcodes_method = None + address1_method = None + landlord_year_built = "Property Age" + landlord_os_uprn = None + landlord_property_type = "Property Type" + landlord_wall_construction = "Landlord Wall Full" + landlord_heating_system = "Landlord Heating" + landlord_existing_pv = None + outcomes_filename = "plus dane outcomes.xlsx" + outcomes_sheetname = "EVERYTHING" + outcomes_postcode = "Post Code" + outcomes_houseno = "Numb." 
+ master_filepaths = [ + os.path.join(data_folder, "JJC Rolling Master.csv"), + os.path.join(data_folder, "SCIS Rolling Master.csv"), + ] + master_to_asset_list_filepath = os.path.join(data_folder, "surveys_to_assets.csv") + + # Maps addresses to uprn in problematic cases + manual_uprn_map = {} + + asset_list = AssetList( + local_filepath=os.path.join(data_folder, data_filename), + header=0, + sheet_name=sheet_name, + address1_colname=address1_column, + postcode_colname=postcode_column, + landlord_property_id=landlord_property_id, + full_address_colname=fulladdress_column, + full_address_cols_to_concat=address_cols_to_concat, + missing_postcodes_method=missing_postcodes_method, + address1_extraction_method=address1_method, + landlord_year_built=landlord_year_built, + landlord_uprn=landlord_os_uprn, + landlord_property_type=landlord_property_type, + landlord_built_form=landlord_built_form, + landlord_wall_construction=landlord_wall_construction, + landlord_roof_construction=landlord_roof_construction, + landlord_heating_system=landlord_heating_system, + landlord_existing_pv=landlord_existing_pv, + landlord_sap=landlord_sap, + phase=phase + ) + asset_list.init_standardise() + + # We produce the new maps, which can be saved for future useage + new_property_type_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_property_type] if + asset_list.landlord_property_type else {} + ).items() + if k not in PROPERTY_MAPPING + } + new_built_form_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_built_form] if + asset_list.landlord_built_form else {} + ).items() + if k not in BUILT_FORM_MAPPINGS + } + new_wall_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_wall_construction] if + asset_list.landlord_wall_construction else {} + ).items() + if k not in WALL_CONSTRUCTION_MAPPINGS + } + new_heating_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_heating_system] 
if + asset_list.landlord_heating_system else {} + ).items() + if k not in HEATING_MAPPINGS + } + new_existing_pv_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {} + ).items() + if k not in EXISTING_PV_MAPPINGS + } + new_roof_construction_map = { + k: v for k, v in ( + asset_list.variable_mappings[asset_list.landlord_roof_construction] if + asset_list.landlord_roof_construction else {} + ).items() + if k not in ROOF_CONSTRUCTION_MAPPINGS + } + + asset_list.apply_standardiation() + + # We now flag properties that have been treated under existing programmes + asset_list.flag_outcomes( + outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None, + outcomes_sheetname=outcomes_sheetname, + outcomes_address=outcomes_address, + outcomes_postcode=outcomes_postcode, + outcomes_houseno=outcomes_houseno, + outcomes_id=outcomes_id + ) + + asset_list.flag_survey_master( + master_filepaths=master_filepaths, + master_to_asset_list_filepath=master_to_asset_list_filepath + ) + + ### We retrieve the EPC data + + # We chunk up this data into 5000 rows at a time + # Create the chunks directory + epc_api_only = False + force_retrieve_data = False + skip = None # Used to skip already completed chunks + chunk_size = 1000 + filename = "Chunk {i}.csv" + download_folder = os.path.join(data_folder, "Chunks") + if not os.path.exists(download_folder): + os.makedirs(download_folder) + + chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size)) + downloaded_files = {filename.format(i=i) for i in chunk_indexes} + + # We check if we have files associated to these files already and if we do, and we do not want to force the + # fetching of the data, we skip + folder_contents = os.listdir(download_folder) + if all(x in folder_contents for x in downloaded_files): + skip = max(chunk_indexes) + + if any(x in folder_contents for x in downloaded_files): + skip 
= max([i for i in chunk_indexes if filename.format(i=i) in folder_contents]) + + for i in range(0, len(asset_list.standardised_asset_list), chunk_size): + print(f"Processing chunk {i} to {i + chunk_size}") + if skip is not None and not force_retrieve_data: + if i <= skip: + continue + chunk = asset_list.standardised_asset_list[i:i + chunk_size] + epc_data_chunk, errors_chunk, no_epc_chunk = get_data( + df=chunk, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + uprn_column=AssetList.STANDARD_UPRN, + fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, + address1_column=AssetList.STANDARD_ADDRESS_1, + postcode_column=AssetList.STANDARD_POSTCODE, + property_type_column=AssetList.STANDARD_PROPERTY_TYPE, + built_form_column=AssetList.STANDARD_BUILT_FORM, + manual_uprn_map=manual_uprn_map, + epc_api_only=epc_api_only, + epc_auth_token=EPC_AUTH_TOKEN + ) + + # We now retrieve any failed properties + chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)] + epc_data_failed, _, _ = get_data( + df=chunk_failed, + row_id_name=asset_list.DOMNA_PROPERTY_ID, + uprn_column=AssetList.STANDARD_UPRN, + fulladdress_column=AssetList.STANDARD_FULL_ADDRESS, + address1_column=AssetList.STANDARD_ADDRESS_1, + postcode_column=AssetList.STANDARD_POSTCODE, + property_type_column=AssetList.STANDARD_PROPERTY_TYPE, + built_form_column=AssetList.STANDARD_BUILT_FORM, + manual_uprn_map=manual_uprn_map, + epc_api_only=epc_api_only, + epc_auth_token=EPC_AUTH_TOKEN + ) + + epc_data_chunk.extend(epc_data_failed) + + # Append the failed data to the main data + # Store the chunk locally as a csv + pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False) + # Store the errors and no-data locally + with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f: + json.dump(errors_chunk, f) + + with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f: + json.dump(no_epc_chunk, f) + + # We read in and 
concatenate the created created chunks + # List the contents + epc_data = [] + for file in downloaded_files: + csv_data = pd.read_csv(os.path.join(download_folder, file)) + # We need to convert the recommendations back to a list + csv_data["recommendations"] = csv_data["recommendations"].apply(eval) + # We don't have this if we didn't run the pulling from find my epc + if "find_my_epc_data" in csv_data.columns: + csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval) + epc_data.append(csv_data) + + epc_df = pd.concat(epc_data) + epc_df["estimated"] = epc_df["estimated"].fillna(False) + + # We expand out the recommendations + recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]] + + unique_recommendations = set() + for _, row in recommendations_df.iterrows(): + unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]]) + + columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations) + transformed_data = [] + for _, row in recommendations_df.iterrows(): + # Initialize a dictionary for this row with False for all recommendations + row_data = {col: False for col in columns} + row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID] + + # Set True for each recommendation present in this row + for rec in row["recommendations"]: + recommendation_text = rec["improvement-summary-text"] + row_data[recommendation_text] = True + + # Append the row data to transformed_data + transformed_data.append(row_data) + + transformed_df = pd.DataFrame(transformed_data) + transformed_df = transformed_df[ + [ + asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)", + "Floor insulation", "Floor insulation (suspended floor)" + ] + ] + + transformed_df["epc_has_floor_recommendation"] = ( + transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] | + transformed_df["Floor insulation (suspended floor)"] + ) + + # Get the find my epc data + if 
"find_my_epc_data" not in epc_df.columns: + epc_df["find_my_epc_data"] = None + + find_my_epc_data = [] + for _, x in epc_df.iterrows(): + if x["find_my_epc_data"]: + find_my_epc_data.append( + { + asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID], + **x["find_my_epc_data"] + } + ) + else: + find_my_epc_data.append( + { + asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID] + } + ) + + find_my_epc_data = pd.DataFrame(find_my_epc_data) + + find_my_epc_data = find_my_epc_data.merge( + transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]], + how="left", on=asset_list.DOMNA_PROPERTY_ID + ) + + # We check if we get the solar pv column: + if "Solar photovoltaics" not in find_my_epc_data.columns: + find_my_epc_data["Solar photovoltaics"] = False + + # Retrieve just the data we need + epc_df = epc_df[ + [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys()) + ].rename( + columns=asset_list.EPC_API_DATA_NAMES + ) + + # Look for columns not in the find my EPC data, which will have happened if we didn't + # retrieve it in the first place + missed_find_epc_cols = [c for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) if c not in find_my_epc_data.columns] + if missed_find_epc_cols: + for c in missed_find_epc_cols: + find_my_epc_data[c] = None + + epc_df = epc_df.merge( + find_my_epc_data[ + [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys()) + ] + .rename(columns=asset_list.FIND_EPC_DATA_NAMES), + how="left", + on=asset_list.DOMNA_PROPERTY_ID + ) + + asset_list.merge_data(epc_df) + + asset_list.extract_attributes() + + cleaned = read_from_s3( + s3_file_name="cleaned_epc_data/cleaned.bson", + bucket_name="retrofit-data-dev" + ) + cleaned = msgpack.unpackb(cleaned, raw=False) + + asset_list.identify_worktypes(cleaned) + + pprint(asset_list.work_type_figures) + + asset_list.flat_analysis() + + ################################################################ 
+ # WESTWARD - comparison between Kieran's method & automated + ################################################################ + + # Check 1) + cavity_fills = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Straight Fill" + ) + cavity_fills = cavity_fills.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + cavity_fills["cavity_reason"] = cavity_fills["cavity_reason"].fillna("Not identified") + print(cavity_fills["cavity_reason"].value_counts()) + # Didn't identify 3 properties because they're bedsits + # 4 properties were identified, not based on the non-intrusives but instead because + # Westward said they were built in 2003/2007. Have adjusted this to use the age from the + # epc as well, as EPC says 1975 and they look like 1975 properties + # 37 properties flagged as already having solar - these are all because the landlord said they have solar + # e.g. 
+ # https://earth.google.com/web/search/11+Winsland+Avenue+TOTNES+TQ9+5FT/@50.43354465,-3.71318276,46.57468503a, + # 59.14004365d,35y,0h,0t, + # 0r/data=CpABGmISXAolMHg0ODZkMWQxOGE4NWRiZjdkOjB4YjBhM2E5M2Q3YWVlMWEwYhlZYgp7fzdJQCHFfC9027QNwCohMTEgV2luc2xhbmQgQXZlbnVlIFRPVE5FUyBUUTkgNUZUGAIgASImCiQJbxsQEoo3SUARXQcp_HE3SUAZBmiZGJ6yDcAhCA0fqq63DcBCAggBOgMKATBCAggASg0I____________ARAA + # https://earth.google.com/web/search/15+St+Anne%27s+Ct,+Newton+Abbot+TQ12+1TL/@50.53068337,-3.61611128, + # 11.74908956a,135.73212429d,35y,0h,0t, + # 0r/data=CpUBGmcSYQolMHg0ODZkMDVkMjFhODhjZjgxOjB4MjBmMzE2Zjc3MGI2NGMwYxlCxHLw8UNJQCFZqyzALe4MwComMTUgU3QgQW5uZSdzIEN0LCBOZXd0b24gQWJib3QgVFExMiAxVEwYAiABIiYKJAm-r6U2iDdJQBHS5ICRdDdJQBmYGVpmiLINwCG8wcrtqbYNwEICCAE6AwoBMEICCABKDQj___________8BEAA + + # Check 2) + cavity_fills_with_solar = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Solar PV - Straight Fill" + ) + cavity_fills_with_solar = cavity_fills_with_solar.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + cavity_fills_with_solar["cavity_reason"] = cavity_fills_with_solar["cavity_reason"].fillna("Not identified") + print(cavity_fills_with_solar["cavity_reason"].value_counts()) + # 203 properties total + # 140 properties were flagged up based on non-intrusives (Non-Intrusive Data Showed Empty Cavity) + # 63 property already has solar + + # Check 3) RDF + rdf = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="RDF CIGA checks" + ) + rdf = rdf.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + rdf["cavity_reason"] = 
rdf["cavity_reason"].fillna("Not identified") + print(rdf["cavity_reason"].value_counts()) + # 264 properties are not identified, 261 of which are due to the fact they contain materials + # The other 3 were determined to be eligible for solar instead + # Many of these units that were identified for rdf works could be solar jobs + + rdf_with_solar = pd.read_excel( + os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"), + sheet_name="Solar PV - RDF CIGA Checks" + ) + rdf_with_solar = rdf_with_solar.merge( + asset_list.standardised_asset_list[ + [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"] + ], + how="left", + left_on=asset_list.landlord_property_id, + right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID + ) + rdf_with_solar["cavity_reason"] = rdf_with_solar["cavity_reason"].fillna("Not identified") + rdf_with_solar["cavity_reason"].value_counts() + + # All others identified - some flagged as empties due to EPC or landlord data suggesting as much + # 5 not identified due to containing COMPACTED BEAD + + asset_list.standardised_asset_list = asset_list.standardised_asset_list[ + asset_list.standardised_asset_list[asset_list.landlord_property_id] + ] + + asset_list.load_contact_details( + local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 12.2.25.xlsx"), + sheet_name="Report 1", + landlord_property_id=asset_list.landlord_property_id, + phone_number_column='Property Current Tel. 
Number', + fullname_column='Proeprty Current Occupant', + firstname_column=None, + lastname_column=None, + email_column=None, # TODO - we need this + ) + + # Convert to a format suitable for CRM + # TODO: TEMP + assigned_surveyors = pd.DataFrame( + [ + { + asset_list.landlord_property_id: "02610001", + "week_commencing": "10/10/2025", + "surveyor_name": "Khalim Conn-Kowlessar", + "surveyor_email": "khalim@domna.homes", + } + ] + ) + + # TODO: Sort the output by postcode + + company_domain = "ealing.gov.uk" + crm_pipeline_name = "Survey Management" + first_dealstage = "READY TO BEGIN SCHEDULING" + # TODO - temp, upload to either SharePoint or AWS + + asset_list.prepare_for_crm( + assigned_surveyors=assigned_surveyors, + company_domain=company_domain, + crm_pipeline_name=crm_pipeline_name, + first_dealstage=first_dealstage + ) + hubspot_data = asset_list.hubspot_data + + # Store as an excel + filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx" + # Store the data in two tabs. 
One for the asset list with the EPC data and the second with the flat data + + with pd.ExcelWriter(filename) as writer: + asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False) + asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False) + # If we have outcomes, we add a tab with the outcomes + if not asset_list.outcomes_for_output.empty: + asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False) + + if not asset_list.unmatched_submissions.empty: + asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False) + + if not asset_list.outcomes_no_match.empty: + asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False) + + # Store the Hubspot export as a csv + hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False) diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py new file mode 100644 index 00000000..e103f794 --- /dev/null +++ b/asset_list/mappings/built_form.py @@ -0,0 +1,148 @@ +import numpy as np + +STANDARD_BUILT_FORMS = { + "unknown", + # Houses + "end-terrace", "semi-detached", "detached", "mid-terrace", + # Flats + "ground floor", "mid-floor", "top-floor", "basement" +} + +BUILT_FORM_MAPPINGS = { + 'House (End Terrace)': 'end-terrace', + 'Ground Floor Flat General': 'ground floor', + 'House (Semi)': 'semi-detached', + 'House (Mid Terrace)': 'mid-terrace', + 'Bungalow': 'unknown', + 'House (Mid terrace)': 'mid-terrace', + 'Maisonette': 'unknown', + 'Flat': 'unknown', + 'First Floor Flat General': 'mid-floor', + 'Bungalow (Semi)': 'semi-detached', + + 'Detached House': 'detached', + 'End Terraced House': 'end-terrace', + 'Studio (Ground floor)': 'ground floor', + 'Mid Terraced House': 'mid-terrace', + 'Ground Floor Flat': 'ground floor', + 'Semi Detached House': 'semi-detached', + 'Detached Property': 'detached', + 'Level not confirmed': 'unknown', 
+ 'Bedsit': 'unknown', + 'Cottage': 'detached', + 'Terraced House': 'mid-terrace', + 'Studio (1st Floor)': 'ground floor', + 'Standard Maisonette': 'unknown', + 'Third Floor Flat or Above': 'top-floor', + 'Town House': 'end-terrace', + 'Guest room in a complex': 'unknown', + 'Back To Back House': 'mid-terrace', + 'PIMSS EMPTY': 'unknown', + 'Flat Basement': 'basement', + 'House': 'unknown', + 'Second Floor Flat': 'mid-floor', + 'First Floor Flat': 'ground floor', + 'Room Only': 'unknown', + + 'End Terrace Housex': 'end-terrace', + 'Mid Terrace Bungalow': 'mid-terrace', + 'End Terrace Bungalow': 'end-terrace', + 'Mid Terrace House': 'mid-terrace', + 'Detached Bungalow': 'detached', + 'End Terrace House': 'end-terrace', + 'Mid Terrace Housekeeping ': 'mid-terrace', + 'Semi Detached Bung': 'semi-detached', + 'Guest Room': 'unknown', + 'Coach House': 'detached', + 'Office Buildings': 'unknown', + 'Maisonnette': 'mid-floor', + 'Bedspace': 'unknown', + 'Studio (3rd floor and above)': 'top-floor', + 'Adapted Property For Disabled': 'unknown', + 'Studio (2nd floor)': 'mid-floor', + np.nan: 'unknown', + 'Third Floor Flat': 'mid-floor', + '2 Ext. 
Wall Flat': 'mid-terrace', + 'Hostel': 'unknown', + 'Flat: Mid Terrace: Mid Floor': 'mid-terrace', + 'Bungalow: SemiDetached': 'semi-detached', + 'Flat: End Terrace: Top Floor': 'end-terrace', + 'Flat: Enclosed End Terrace: Top Floor': 'end-terrace', + 'Maisonette: End Terrace: Ground Floor': 'end-terrace', + 'Flat: End Terrace: Ground Floor': 'end-terrace', + 'Flat: Mid Terrace: Top Floor': 'mid-terrace', + 'House: Detached': 'detached', + 'Flat: End Terrace: Mid Floor': 'end-terrace', + 'House: SemiDetached': 'semi-detached', + 'Flat: Semi Detached: Ground Floor': 'semi-detached', + 'Flat: Semi Detached: Top Floor': 'semi-detached', + 'Flat: Mid Terrace: Ground Floor': 'mid-terrace', + 'House: MidTerrace': 'mid-terrace', + 'House: EndTerrace': 'end-terrace', + 'Bungalow: EndTerrace': 'end-terrace', + 'Bungalow: MidTerrace': 'mid-terrace', + 'Flat: Semi Detached: Mid Floor': 'semi-detached', + 'Maisonette: Mid Terrace: Top Floor': 'mid-terrace', + 'Flat: Enclosed Mid Terrace: Mid Floor': 'mid-terrace', + 'Flat: Enclosed Mid Terrace: Ground Floor': 'mid-terrace', + 'Flat: Detached: Ground Floor': 'detached', + 'Flat: Detached: Mid Floor': 'detached', + 'Flat: Detached: Top Floor': 'detached', + 'Flat: Enclosed End Terrace: Mid Floor': 'end-terrace', + 'Bungalow: Detached': 'detached', + 'Maisonette: End Terrace: Mid Floor': 'end-terrace', + 'Maisonette: Detached: Top Floor': 'detached', + 'Flat: Enclosed End Terrace: Ground Floor': 'end-terrace', + 'Flat: Enclosed Mid Terrace: Top Floor': 'mid-terrace', + 'House: EnclosedEndTerrace': 'end-terrace', + '3 Ext. Wall Flat': 'semi-detached', + 'Bungalow Detached': 'detached', + 'Bungalow End Terrace': 'end-terrace', + 'Bungalow Mid Terrace': 'mid-terrace', + 'Bungalow Semi Detached': 'detached', + 'Maisonette 2 Ext. Wall': 'mid-terrace', + 'Maisonette 3 Ext. 
Wall': 'semi-detached', + 'End-terrace': 'end-terrace', + 'Mid-terrace': 'mid-terrace', + 'Semi-detached': 'semi-detached', + 'Detached': 'detached', + 'Flat / maisonette': 'unknown', + '2014 onwards': 'unknown', + + 'Semi Detached': 'semi-detached', + 'End Terraced': 'end-terrace', + 'Basement': 'basement', + 'No': 'unknown', + 'Mid Terrace': 'mid-terrace', + 'Link Detached': 'detached', + 'Mid Terraced': 'mid-terrace', + 'Ground Floor': 'ground floor', + 'End Terrace': 'end-terrace', + 'Sheltrd Semi Det': 'semi-detached', + 'Shop': 'unknown', + 'Fourth Floor': 'mid-floor', + 'Terraced': 'mid-terrace', + 'Leasehold Terr': 'mid-terrace', + 'Room': 'unknown', + 'Second Floor': 'mid-floor', + 'Third Floor': 'mid-floor', + 'Office': 'unknown', + 'First Floor Over Arch': 'ground floor', + '16-25 IND-PPL': 'unknown', + 'Seventh Floor': 'top-floor', + 'Sheltered': 'unknown', + 'Shelt Bung End': 'end-terrace', + 'Room In Shared Accommodation': 'unknown', + 'Sheltred Bung Terrace': 'mid-terrace', + 'Garage In Block': 'unknown', + 'First Floor': 'ground floor', + 'First Floor Over Garage': 'ground floor', + 'Leasehold': 'unknown', + 'Sheltred Bung': 'unknown', + 'Garage': 'unknown', + 'Sixth Floor': 'top-floor', + 'Sheltered Bung': 'semi-detached', + 'Guest': 'unknown', + 'Fifth Floor': 'mid-floor' + +} diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py new file mode 100644 index 00000000..51f5f922 --- /dev/null +++ b/asset_list/mappings/exising_pv.py @@ -0,0 +1,20 @@ +import numpy as np + +STANDARD_EXISTING_PV = { + "already has PV", "no PV", "unknown" +} + +EXISTING_PV_MAPPINGS = { + "NO": "no PV", + "YES": "already has PV", + "no": "no PV", + "yes": "already has PV", + True: "already has PV", + False: "no PV", + np.nan: 'unknown', + 'PV: 2kWp array': 'already has PV', + 'PV: 25% roof area, PV: 3.6kWp array': 'already has PV', + 'PV: 10% roof area, PV: 2kWp array': 'already has PV', + 'PV: 50% roof area': 'already has PV', + 'Solar PV': 
'already has PV' +} diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py new file mode 100644 index 00000000..7f2f81f2 --- /dev/null +++ b/asset_list/mappings/heating_systems.py @@ -0,0 +1,206 @@ +import numpy as np + +STANDARD_HEATING_SYSTEMS = { + "gas combi boiler", + "electric storage heaters", + "district heating", + "gas condensing boiler", + "oil boiler", + "gas condensing combi", + "air source heat pump", + "boiler - other fuel", + "ground source heat pump", + "electric radiators", + "other", + "electric boiler", + "unknown", + "communal gas boiler", + "high heat retention storage heaters", + "room heaters", + 'electric fuel', + 'oil fuel', + 'solid fuel', + 'gas combi boiler', + 'unknown', + "electric ceiling", + "electric underfloor", + "no heating" +} + +HEATING_MAPPINGS = { + "Combi - GAS": "gas combi boiler", + "E7 Storage Heaters": "high heat retention storage heaters", + "District heating system": "district heating", + "Condensing Boiler - GAS": "gas condensing boiler", + "Boiler Oil/other": "oil boiler", + "Condensing Combi - Gas": "gas condensing combi", + "Air Source Source Heat Pump": "air source heat pump", + "Biomass Boiler": "boiler - other fuel", + "Ground Source Heat Pump": "ground source heat pump", + "Electric Oil filled radiators": "electric radiators", + "Solid Fuel": "other", + "LPG Boiler": "boiler - other fuel", + "Electric Boiler": "electric boiler", + "No data": "unknown", + "Boiler Communal/Commercial - GAS": "communal gas boiler", + "Eco Electric Radiators": "electric radiators", + "Gas fire": "other", + "Backboiler - Solid fuel": "other", + 'combi - gas': 'gas combi boiler', + 'e7 storage heaters': 'high heat retention storage heaters', + 'district heating system': 'district heating', + 'condensing boiler - gas': 'gas condensing boiler', + 'boiler oil/other': 'oil boiler', + 'condensing combi - gas': 'gas condensing combi', + 'air source source heat pump': 'air source heat pump', + 'biomass 
boiler': 'boiler - other fuel', + 'ground source heat pump': 'ground source heat pump', + 'electric oil filled radiators': 'electric radiators', + 'solid fuel': 'other', + 'lpg boiler': 'boiler - other fuel', + 'electric boiler': 'electric boiler', + 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler', + 'eco electric radiators': 'electric radiators', + 'gas fire': 'other', 'backboiler - solid fuel': 'other', + 'ASHP': 'air source heat pump', + 'COMMHEAT': 'communal gas boiler', + 'GBB': 'gas combi boiler', + 'GFS': 'gas condensing boiler', + 'GWA': 'gas condensing boiler', + 'GWM': 'gas condensing combi', + 'HDU': 'district heating', + 'OILBLR': 'oil boiler', + 'SOLIDFUEL': 'boiler - other fuel', + 'STORHTR': 'electric storage heaters', + np.nan: 'unknown', + 'Oil': 'boiler - other fuel', + 'Gas': 'gas condensing boiler', + 'Electric': 'electric storage heaters', + 'Solid fuel': 'other', + 'No Heat': 'unknown', + 'GSHP': 'ground source heat pump', + + 'Boiler Oil': 'oil boiler', + 'Boiler Electricity': 'electric boiler', + 'Boiler ND': 'unknown', + 'ND Mains gas': 'unknown', + 'Room heaters Mains gas': "room heaters", + 'Heat pump (air) Electricity': 'air source heat pump', + 'Room heaters Electricity': 'electric radiators', + 'Room heaters Oil': 'room heaters', + 'No heating system ND': 'no heating', + 'Heat pump (wet) Electricity': 'ground source heat pump', + 'Room heaters Biomass': 'room heaters', + 'ND Solid fuel': 'unknown', + 'Boiler Mains gas': 'gas combi boiler', + 'Boiler LPG': 'boiler - other fuel', + 'Room heaters Solid fuel': 'room heaters', + 'ND ND': 'unknown', + 'Storage heating Electricity': 'electric storage heaters', + 'ND Electricity': 'unknown', + 'Community heating Community (non-gas)': 'district heating', + 'No heating system N/A': 'no heating', + 'Boiler Solid fuel': 'boiler - other fuel', + 'Community heating Community (mains gas)': 'communal gas boiler', + 'Boiler Biomass': 'boiler - other fuel', + 'No heating 
system Mains gas': 'no heating', + + 'Storage heaters': 'electric storage heaters', + 'Air Source': 'air source heat pump', + 'Ground source': 'ground source heat pump', + 'OIl': 'boiler - other fuel', + 'Quantum storage heaters (old sh on EPC)': 'high heat retention storage heaters', + 'Quanum Storage heaters': 'high heat retention storage heaters', + 'Quantum storage heaters (Old SH on EPC)': 'high heat retention storage heaters', + 'Quantum storage heaters': 'high heat retention storage heaters', + 'Air Source (EPC says SH)': 'air source heat pump', + 'ASHP - Was logged as oil': 'air source heat pump', + 'Ground Source': 'ground source heat pump', + 'District Heating': 'district heating', + 'Mains Gas (Communal)': 'communal gas boiler', + 'LPG': 'boiler - other fuel', + 'Mains Gas': 'gas condensing boiler', + 'ELECTRIC': 'electric fuel', + 'OIL': 'oil fuel', + 'SOLID FUEL': 'solid fuel', + 'GAS': 'gas combi boiler', + 'DO NOT SURVEY': 'unknown', + 'Gas Boiler': 'gas combi boiler', + 'Communal Gas ': 'communal gas boiler', + 'Communal': 'communal gas boiler', + 'Communal Gas': 'communal gas boiler', + 'Wood Burning Boiler': "boiler - other fuel", + 'Oil Fired Boiler': 'oil boiler', + 'Electric (direct acting) room heaters: Panel, convector or radiant heaters Electricity: Electricity': 'room ' + 'heaters', + 'Electric Storage Systems: Integrated storage+direct-acting heater Electricity: Electricity': 'electric storage ' + 'heaters', + 'Community Heating Systems: Community CHP and boilers (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler', + 'Boiler: D rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Boiler: C rated Combi Gas: Mains Gas': 'gas combi boiler', + 'Electric Storage Systems: Fan storage heaters Electricity: Electricity': 'electric storage heaters', + ' ': 'unknown', + 'Boiler: G rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Electric Storage Systems: Modern (slimline) storage heaters Electricity: Electricity': 'electric storage 
heaters', + 'Boiler: E rated Regular Boiler Gas: Mains Gas': 'gas boiler', + 'Boiler: A rated Regular Boiler Electricity: Electricity': 'electric boiler', + 'Community Heating Systems: Community boilers only (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler', + 'Boiler: A rated Combi Gas: Mains Gas': 'gas condensing combi', + 'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler', + 'Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C': 'ground source heat pump', + 'Heat Pump: Electric Heat pumps: Ground source heat pump in other cases': 'ground source heat pump', + 'Electric Storage Systems: High heat retention storage heaters': 'high heat retention storage heaters', + 'Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C': 'air source heat pump', + 'Electric (direct acting) room heaters: Panel, convector or radiant heaters': 'room heaters', + 'Boiler: C rated Combi': 'gas combi boiler', + 'Boiler: B rated Regular Boiler': 'gas condensing boiler', + 'Boiler: E rated Combi': 'gas combi boiler', + 'Boiler: A rated Combi': 'gas combi boiler', + 'Boiler: E rated Regular Boiler': 'gas condensing boiler', + 'Community Heating Systems: Community boilers only (RdSAP)': 'district heating', + 'Boiler: C rated Regular Boiler': 'gas condensing boiler', + 'Boiler: A rated Regular Boiler': 'gas condensing boiler', + 'Electric Storage Systems: Fan storage heaters': 'electric storage heaters', + 'Boiler: F rated Combi': 'gas combi boiler', + + 'Room heaters': 'room heaters', + 'Room Heaters': 'room heaters', + 'Boiler': 'gas condensing boiler', + 'Heat Pump (Wet)': 'air source heat pump', + 'Community Heating': 'district heating', + 'Heat pump (wet)': 'air source heat pump', + 'Electric ceiling heating': 'electric ceiling', + 'Electric under floor heating': 'electric underfloor', + 'Community heating': 'district heating', + + 'Wet - Radiators Air Source Heat Pump': 'air source heat pump', + 'Wet - 
Radiators Electric': 'electric boiler', + 'Storage Heaters': 'high heat retention storage heaters', + 'Wet - Radiators Oil': 'oil boiler', + 'Communal Wet - Radiators Gas': 'communal gas boiler', + 'Electric - Storage/Panel Heaters Electric': 'electric storage heaters', + 'Gas Central Heating': 'gas combi boiler', + 'Wet - Radiators Solar': 'other', + 'Electric - Storage/Panel Heaters LPG': 'electric storage heaters', + 'No Heating Solid': 'no heating', + 'Wet - Underfloor Gas': 'gas condensing boiler', + 'No Heating Electric': 'no heating', + 'Oil Fired Central Heating': 'oil boiler', + 'Warm Air Gas': 'other', + 'Communal Boilers': 'communal gas boiler', + 'Wet - Radiators Gas': 'gas combi boiler', + 'Wet - Radiators Solid': 'solid fuel', + 'Wet - Radiators LPG': 'other', + 'No Heating Gas': 'no heating', + 'No Heating': 'no heating', + 'Panel Heaters': 'electric radiators', + 'Rointe Electric Heating': 'electric storage heaters', + 'Underfloor Heating': 'electric underfloor', + 'Air Source Heating': 'air source heat pump', + 'Warm Air Electric': 'other', + 'Communal Wet - Radiators Electric': 'communal gas boiler', + 'Wet - Underfloor Solar': 'other', + 'No Heating Required Gas': 'unknown', + 'Electric - Storage/Panel Heaters Gas': 'electric storage heaters', + 'Electric - Storage/Panel Heaters Solid': 'electric storage heaters' +} diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py new file mode 100644 index 00000000..dc8dbf21 --- /dev/null +++ b/asset_list/mappings/property_type.py @@ -0,0 +1,182 @@ +import numpy as np + +# These are the standard categories for property types +STANDARD_PROPERTY_TYPES = { + "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house", + "unknown", "other", "block of flats" +} + +# This is a basic mapping that we use to map values that we've seen commonly to standard values +PROPERTY_MAPPING = { + "HOUSE": "house", + "FLAT": "flat", + "MAISONET": 
"maisonette", + "BUNGALOW": "bungalow", + "BLKHOUS": "block house", + "blkhous": "block house", + "BEDSIT": "bedsit", + "COACHSE": "coach house", + "coachse": "coach house", + 'Admin Unit Type': 'unknown', + 'Block': 'block of flats', + 'Bungalow': 'bungalow', + 'Flat': 'flat', + 'House': 'house', + 'Maisonette': 'maisonette', + 'Stairwell': 'other', + 'MAISON': 'maisonette', + '3 Bed Semi Detached House': 'house', + '3 Bed Mid Terrace House': 'house', + '2 Bed Semi Detached House': 'house', + '4 Bed Semi Detached House': 'house', + '2 Bed End Terrace House': 'house', + '1 Bed Sheltered Bungalow': 'bungalow', + '1 Bed 1st Floor Sheltered Flat': 'flat', + '2 Bed Second Floor Flat': 'flat', + '1 Bed Mid Terrace House': 'house', + '1 Bed End Terrace House': 'house', + '7 Bed Detached House': 'house', + '4 Bed End Terrace House': 'house', + '1 Bed Link House': 'house', + '1 Bed Second Floor Flat': 'flat', + '2 Bed Detached House': 'house', + '1 Bed Ground Floor Flat': 'flat', + '2 Bed Sheltered Bungalow': 'bungalow', + '4 Bed Mid Terrace House': 'house', + '2 Bed Mid Terrace House': 'house', + '2 Bed First Floor Flat': 'flat', + '3 Bed Detached House': 'house', + 'Ground Floor Bedsit': 'bedsit', + '3 Bed Bungalow': 'bungalow', + np.nan: 'unknown', + '5 Bed End Terrace House': 'house', + '1 Bed Grd Floor Sheltered Flat': 'flat', + '3 Bed End Terrace House': 'house', + '2 Bed Second Floor Maisonette': 'maisonette', + '2 Bed Ground Floor Flat': 'flat', + '2 Bed First Floor Maisonette': 'maisonette', + '4 Bed Detached House': 'house', + '1 Bed Bungalow': 'bungalow', + '2 Bed Bungalow': 'bungalow', + 'First Floor Bedsit': 'bedsit', + '3 Bed First Floor Maisonette': 'maisonette', + '2 Bed 1st Floor Sheltered Flat': 'flat', + '1 Bed First Floor Flat': 'flat', + '3 Bed First Floor Flat': 'flat', + 'ND': 'unknown', + 'House (Mid Terrace)': 'house', + 'First Floor Flat General': 'flat', + 'House (End Terrace)': 'house', + 'House (Mid terrace)': 'house', + 'Bungalow (Semi)': 
'bungalow', + 'Ground Floor Flat General': 'flat', + 'House (Semi)': 'house', + 'Detached House': 'house', + 'Bedsit': 'bedsit', + 'Terraced House': 'house', + 'Standard Maisonette': 'maisonette', + 'End Terraced House': 'house', + 'Third Floor Flat or Above': 'flat', + 'Town House': 'house', + 'Mid Terraced House': 'house', + 'Back To Back House': 'house', + 'Flat Basement': 'flat', + 'Ground Floor Flat': 'flat', + 'Semi Detached House': 'house', + 'Second Floor Flat': 'flat', + 'First Floor Flat': 'flat', + 'Level not confirmed': 'flat', + 'Cottage': 'house', + 'Studio (1st Floor)': 'flat', + 'Studio (Ground floor)': 'flat', + 'Guest room in a complex': 'other', + 'PIMSS EMPTY': 'bedsit', + 'Room Only': 'other', + 'Detached Property': 'house', + 'End Terrace Housex': 'house', + 'Coach House': 'coach house', + 'Mid Terrace Bungalow': 'bungalow', + 'End Terrace Bungalow': 'bungalow', + 'Mid Terrace House': 'house', + 'Detached Bungalow': 'bungalow', + 'End Terrace House': 'house', + 'Mid Terrace Housekeeping ': 'house', + 'Maisonnette': 'maisonette', + 'Guest Room': 'unknown', + 'Office Buildings': 'unknown', + 'Semi Detached Bung': 'bungalow', + 'Bedspace': 'bedsit', + 'Houses/Bungalows': 'bungalow', + 'Bedsits': 'bedsit', + 'Unknown': 'unknown', + 'Sheltered Flats/besits': 'flat', + 'House/Bungalow ': 'bungalow', + 'Low/Med Rise Flats/Mais': 'flat', + 'Staff/Comm': 'other', + 'A Rooms': 'other', + 'Studio (3rd floor and above)': 'flat', + 'Adapted Property For Disabled': 'unknown', + 'Studio (2nd floor)': 'flat', + 'Third Floor Flat': 'flat', + '2 Ext. 
Wall Flat': 'flat', + 'Hostel': 'other', + 'House: MidTerrace': 'house', + 'House: EndTerrace': 'house', + 'Flat: Mid Terrace: Mid Floor': 'flat', + 'Bungalow: SemiDetached': 'bungalow', + 'Bungalow: EndTerrace': 'bungalow', + 'Flat: End Terrace: Top Floor': 'flat', + 'Maisonette: End Terrace: Ground Floor': 'maisonette', + 'Flat: End Terrace: Ground Floor': 'flat', + 'Flat: Mid Terrace: Top Floor': 'flat', + 'House: Detached': 'house', + 'Flat: End Terrace: Mid Floor': 'flat', + 'House: SemiDetached': 'house', + 'Flat: Semi Detached: Ground Floor': 'flat', + 'Flat: Semi Detached: Top Floor': 'flat', + 'Flat: Mid Terrace: Ground Floor': 'flat', + 'Bungalow: MidTerrace': 'bungalow', + 'Flat: Enclosed End Terrace: Top Floor': 'flat', + 'Flat: Semi Detached: Mid Floor': 'flat', + 'Maisonette: Mid Terrace: Top Floor': 'maisonette', + 'House: EnclosedEndTerrace': 'house', + 'Flat: Detached: Ground Floor': 'flat', + 'Flat: Detached: Mid Floor': 'flat', + 'Flat: Detached: Top Floor': 'flat', + 'Bungalow: Detached': 'bungalow', + 'Maisonette: End Terrace: Mid Floor': 'maisonette', + 'Maisonette: Detached: Top Floor': 'maisonette', + 'Flat: Enclosed Mid Terrace: Mid Floor': 'flat', + 'Flat: Enclosed Mid Terrace: Ground Floor': 'flat', + 'Flat: Enclosed End Terrace: Mid Floor': 'flat', + 'Flat: Enclosed End Terrace: Ground Floor': 'flat', + 'Flat: Enclosed Mid Terrace: Top Floor': 'flat', + '2013 onwards': 'unknown', + + 'House 2 Storey': 'house', + 'Bung': 'bungalow', + 'House 3 Storey': 'house', + 'Shared Flat': 'flat', + 'd': 'unknown', + 'Mais': 'maisonette', + 'e': 'unknown', + 'Shared House': 'house', + 'House 4 Storey': 'house', + 'Shared Bungalow': 'bungalow', + 'Detch': 'house', + 'Shop': 'other', + 'Terr': 'house', + 'Terrace': 'house', + 'Description': 'unknown', + 'Hse': 'house', + 'Room': 'other', + 'Office': 'other', + 'Room In Shared Accommodation': 'other', + 'Apartment': 'flat', + 'm': 'unknown', + 'Garage': 'other', + 'Parking Space': 'other', + 'Community 
Centre': 'other', + 'Communal Facility': 'other', + 'Semi': 'house' +} diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py new file mode 100644 index 00000000..a95f0529 --- /dev/null +++ b/asset_list/mappings/roof.py @@ -0,0 +1,27 @@ +import numpy as np + +STANDARD_ROOF_CONSTRUCTIONS = { + "pitched access to loft", + "pitched no access to loft", + "pitched unknown access to loft", + "piched unknown insulation", + "pitched insulated", + "another dwelling above", + "flat unknown insulation", + "unknown insulated", + "unknown", +} + +ROOF_CONSTRUCTION_MAPPINGS = { + 'Flat': 'flat unknown insulation', + 'Pitched (access to loft)': 'pitched access to loft', + 'Pitched (no access to loft)': 'pitched no access to loft', + 'Another dwelling above': 'another dwelling above', + 'Same dwelling above': 'another dwelling above', + 'As-built': 'unknown', + 'ND (inferred)': 'unknown', + '2018 onwards': 'unknown', + 'Pitched (vaulted ceiling)': 'pitched insulated', + np.nan: "unknown", + None: "unknown" +} diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py new file mode 100644 index 00000000..c327338a --- /dev/null +++ b/asset_list/mappings/walls.py @@ -0,0 +1,170 @@ +import numpy as np + +STANDARD_WALL_CONSTRUCTIONS = { + # Cavity + "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation", + # Solic Brick + "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation", + # Timber Frame + "timber frame unknown insulation", "insulated timber frame", "uninsulated timber frame", + "system built", "granite or whinstone", "other", + "unknown", "sandstone or limestone", + "cob", + "new build - average thermal transmittance", +} + +WALL_CONSTRUCTION_MAPPINGS = { + "New Build - Average Thermal Transmittance": "new build - average thermal transmittance", + 'Average thermal transmittance 0.25 W/m?K': 'unknown', + 'Cavity wall, as built, insulated (assumed)': 'filled cavity', + 'Average 
thermal transmittance 0.31 W/m?K': 'unknown', + 'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m?K': 'unknown', + 'Average thermal transmittance 0.27 W/m²K': 'unknown', + 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m?K': 'unknown', + 'Granite or whin, with internal insulation': 'granite or whinstone', + "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone", + 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown', + 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown', + 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown', + 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown', + 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'Average thermal transmittance 0.33 W/m?K': 'unknown', + 'Cavity wall,': "cavity unknown insulation", + 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown', + 'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown', + 'Average thermal transmittance 
0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown', + 'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown', + 'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown', + 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown', + 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown', + 'Cavity wall, with internal insulation': 'filled cavity', + 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown', + 'new build - average thermal transmittance': 'new build - average thermal transmittance', + 'average thermal transmittance 0.25 w/m?k': 'unknown', + 'cavity wall, as built, insulated (assumed)': 'filled cavity', + 'average thermal transmittance 0.31 w/m?k': 'unknown', + 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity', + 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown', + 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown', + 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown', + 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m?k': 'unknown', + 'granite or whin, with internal insulation': 'granite or whinstone', + 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown', + 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown', + 'average thermal 
transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown', + 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown', + 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone', + 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation", + 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity', + 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown', + 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown', + 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown', + 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown', + 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown', + 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown', + 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown', + 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown', + 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown', + 'average thermal transmittance 0.28 w/m?k': 'unknown', + 'Cavity wall, filled cavity': 'filled cavity', + 'Cavity wall, filled cavity and external insulation': 'filled cavity', + 'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone', + 'Solid brick, as built, insulated (assumed)': 'insulated solid brick', + 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick', + 'Solid brick, with external insulation': 'insulated solid brick', + 'Solid 
brick, with internal insulation': 'insulated solid brick', + 'System built, as built, insulated (assumed)': 'system built', + 'System built, as built, no insulation (assumed)': 'system built', + 'System built, with external insulation': 'system built', + 'System built, with internal insulation': 'system built', + 'Timber frame, as built, insulated (assumed)': 'timber frame', + 'Timber frame, as built, no insulation (assumed)': 'timber frame', + 'Timber frame, as built, partial insulation (assumed)': 'timber frame', + 'Timber frame, with additional insulation': 'timber frame', + 'CAVITY': 'cavity unknown insulation', + 'COMB': 'unknown', + 'NONE': 'unknown', + 'NOTKNOWN': 'unknown', + 'SOLID': 'solid brick unknown insulation', + np.nan: 'unknown', + 'RENDER/TIMBER FRAME': 'timber frame', + 'SYSTEM BUILT': 'system built', + 'PCC PANELS': 'other', + 'NOT APPLICABLE - FLAT': 'unknown', + 'BRICK/TIMBER FRAME': 'timber frame', + 'BRICK/BLOCK CAVITY': 'cavity unknown insulation', + 'STONE SOLID': 'sandstone or limestone', + 'EXT CLADDING SYSTEM': 'system built', + 'BRICK/BLOCK SOLID': 'solid brick unknown insulation', + + 'Cavity Filled cavity (with internal/external)': 'filled cavity', + 'ND (inferred) Filled cavity': 'filled cavity', + 'Cavity Filled cavity': 'filled cavity', + 'Cavity Unknown insulation': 'cavity unknown insulation', + 'Timber frame As-built': 'timber frame', + 'System build Unknown insulation': 'system built', + 'Cavity As-built': 'uninsulated cavity', + 'System build External': 'system built', + 'ND (inferred) ND (inferred)': 'unknown', + 'Solid brick External': 'insulated solid brick', + 'Cavity External': 'filled cavity', + 'System build As-built': 'system built', + 'Solid brick Internal': 'insulated solid brick', + 'Cavity Internal': 'filled cavity', + 'System build Internal': 'system built', + 'Solid brick As-built': 'solid brick unknown insulation', + + 'Cavity ': 'cavity unknown insulation', + 'Solid brick ': 'solid brick unknown insulation', + 
'Timber frame Timber frame (good insulation)': 'insulated timber frame', + ' ': 'unknown', + 'Cavity No data': 'cavity unknown insulation', + 'Non trad ': 'other', + 'Solid brick / Multiple Attributes ': 'solid brick unknown insulation', + 'Cavity Believe CWI done by Dyson': 'filled cavity', + 'Cavity CWI required': 'uninsulated cavity', + 'Solid brick EWI installed': 'insulated solid brick', + 'Cavity Cavity batts': 'filled cavity', + 'Cavity CWI Completed by Dyson': 'filled cavity', + None: "unknown", + "Cavity": "cavity unknown insulation", + 'SolidBrick: Unknown': 'solid brick unknown insulation', + 'Cavity: Unknown': 'cavity unknown insulation', + 'Cavity: AsBuilt (Post 1995)': 'filled cavity', + 'Cavity: AsBuilt (1976-1982)': 'cavity unknown insulation', + 'SystemBuilt: AsBuilt': 'system built', + 'TimberFrame: AsBuilt': "timber frame unknown insulation", + 'Cavity: AsBuilt (1983-1995)': 'cavity unknown insulation', + 'Cavity: AsBuilt (1983-1995), Cavity: FilledCavity': 'filled cavity', + 'SolidBrick: AsBuilt': 'solid brick unknown insulation', + 'Cavity: FilledCavity': 'filled cavity', + 'SolidBrick: Internal': 'insulated solid brick', + 'Cavity: External': 'filled cavity', + 'Sandstone: Internal': 'sandstone or limestone', + 'Cavity: AsBuilt (Pre 1976)': 'cavity unknown insulation', + 'System build': 'system built', + 'Solid brick': 'solid brick unknown insulation', + 'Stone': 'sandstone or limestone', + 'Timber frame': 'timber frame unknown insulation', + '2017 onwards': 'new build - average thermal transmittance', + 'ND (inferred)': 'unknown', + 'Flat / maisonette': 'other', + + 'Other': 'other', + 'Timber Frame': 'timber frame unknown insulation', + 'Cavity Wall': 'cavity unknown insulation', + 'Non-Traditional': 'system built', + 'PRC': 'system built', + 'Cross Wall': 'system built', + 'Solid Wall': 'solid brick unknown insulation', + 'Traditional': 'other' +} diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt new file mode 100644 
import time
import numpy as np
import pandas as pd
from backend.SearchEpc import SearchEpc
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
from tqdm import tqdm
from utils.logger import setup_logger

logger = setup_logger()


def _retrieve_find_my_epc_data(newest_epc):
    """
    Fetch the newest FindMyEPC record for an EPC, retrying with "address1" when the full address has no match.

    :param newest_epc: The EPC record; "address" and "postcode" are read, "address1" is used for the retry
    :return: The FindMyEPC data, or an empty dict when no record could be found
    :raises Exception: If the lookup fails for a reason other than a missing EPC
    """
    try:
        find_epc_searcher = RetrieveFindMyEpc(
            address=newest_epc["address"], postcode=newest_epc["postcode"]
        )
        return find_epc_searcher.retrieve_newest_find_my_epc_data()
    except ValueError as e:
        # Any ValueError that we cannot retry is treated as "no data" (best effort)
        if "No EPC found" not in str(e) or "address1" not in newest_epc:
            return {}
    except Exception as e:
        raise Exception(f"Error retrieving FindMyEPC data: {e}") from e

    # The full address had no match, so we retry with the first address line only
    try:
        find_epc_searcher = RetrieveFindMyEpc(
            address=newest_epc["address1"], postcode=newest_epc["postcode"]
        )
        return find_epc_searcher.retrieve_newest_find_my_epc_data()
    except ValueError as e:
        if "No EPC found" in str(e):
            return {}
        logger.error(f"Error retrieving FindMyEPC data: {e}")
        raise Exception(f"Error retrieving FindMyEPC data: {e}") from e


def get_data(
        df,
        manual_uprn_map,
        epc_auth_token,
        uprn_column,
        fulladdress_column,
        address1_column,
        postcode_column,
        property_type_column,
        built_form_column,
        epc_api_only=False,
        row_id_name="row_id",
):
    """
    Retrieve the newest EPC (plus recommendations and, optionally, FindMyEPC data) for every row of an asset list.

    For each home we search by UPRN/address, retry with a fallback first address line, and as a final resort
    hand the property type and built form to the searcher so it can estimate an EPC.

    :param df: The asset list, one row per home
    :param manual_uprn_map: Dictionary of {full address: uprn} overrides, which take priority over the uprn column
    :param epc_auth_token: Auth token for the EPC API
    :param uprn_column: Name of the column holding the UPRN
    :param fulladdress_column: Name of the column holding the full address
    :param address1_column: Name of the column holding the first address line / house number
    :param postcode_column: Name of the column holding the postcode
    :param property_type_column: Name of the column holding the standardised property type
    :param built_form_column: Name of the column holding the standardised built form
    :param epc_api_only: If True, the FindMyEPC lookup is skipped
    :param row_id_name: Name of the column that uniquely identifies a row
    :return: Tuple of (epc_data, errors, no_epc) - the EPC records found, the row ids that raised an error and
        the row ids for which no EPC could be found
    """
    # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
    property_type_map = {
        "house": "House",
        "flat": "Flat",
        "maisonette": "Maisonette",
        "bungalow": "Bungalow",
        "block house": "House",
        "coach house": "House",
        "bedsit": "Flat"
    }

    built_form_map = {
        "mid-terrace": "Mid-Terrace",
        "end-terrace": "End-Terrace",
        "semi-detached": "Semi-Detached",
        "detached": "Detached"
    }

    epc_data = []
    errors = []
    no_epc = []
    for _, home in tqdm(df.iterrows(), total=len(df)):
        try:

            # If we have a block of flats, we cannot retrieve this data
            if home.get(property_type_column) == "block of flats":
                no_epc.append(home[row_id_name])
                continue

            postcode = home[postcode_column]
            house_number = str(home[address1_column]).strip()
            full_address = home[fulladdress_column].strip()
            # Parsed once and re-used for the retry below
            parsed_house_no = SearchEpc.get_house_number(address=house_number, postcode=postcode)
            house_no = house_number if parsed_house_no is None else parsed_house_no

            uprn = manual_uprn_map.get(full_address, None)
            if uprn is None and home.get(uprn_column):
                uprn = home[uprn_column]

            if pd.isnull(uprn):
                uprn = None

            property_type = property_type_map.get(home.get(property_type_column), None)
            built_form = built_form_map.get(home.get(built_form_column))

            searcher = SearchEpc(
                address1=str(house_no),
                postcode=postcode,
                auth_token=epc_auth_token,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5,
                uprn=uprn
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)

            # Check if we have a flat or apartment
            if searcher.newest_epc is None and uprn is None:
                # Try again with a fallback first address line
                if parsed_house_no is None:
                    address_parts = full_address.split(",")
                    if len(address_parts) > 1:
                        add1 = address_parts[1].strip()
                    else:
                        # Try splitting on space
                        add1 = full_address.split(" ")[0].strip()
                else:
                    add1 = house_number

                searcher = SearchEpc(
                    address1=add1,
                    postcode=postcode,
                    auth_token=epc_auth_token,
                    os_api_key="",
                    property_type=None,
                    fast=True,
                    full_address=full_address,
                    max_retries=5
                )

                lowered_house_number = house_number.lower()
                if any(token in lowered_house_number for token in ("flat", "apartment", "apt")):
                    searcher.ordnance_survey_client.property_type = "Flat"

                searcher.find_property(skip_os=True)

            # As a final resort, we estimate the EPC
            if property_type is not None and searcher.newest_epc is None:
                searcher.ordnance_survey_client.property_type = property_type
                searcher.ordnance_survey_client.built_form = built_form
                searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                no_epc.append(home[row_id_name])
                continue

            # Look for EPC recommendations. This is best effort - a failure here should not lose the EPC
            try:
                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
            except Exception:
                property_recommendations = {"rows": []}

            epc = {
                row_id_name: home[row_id_name],
                **searcher.newest_epc.copy(),
                "recommendations": property_recommendations["rows"]
            }

            if not epc_api_only:
                # Retrieve data from FindMyEPC
                epc["find_my_epc_data"] = _retrieve_find_my_epc_data(searcher.newest_epc)
                # Random pause between FindMyEPC requests
                time.sleep(np.random.uniform(0.1, 1))

            epc_data.append(epc)
        except Exception as e:
            errors.append(home[row_id_name])
            # Surface the failure, rather than silently dropping the row into the errors list
            logger.error(f"Error retrieving EPC data for row {home[row_id_name]}: {e}")
            time.sleep(5)

    return epc_data, errors, no_epc
from typing import TYPE_CHECKING, List

import numpy as np
import pandas as pd

if TYPE_CHECKING:
    from backend.app.plan.schemas import HousingType


class Funding:
    """
    Given a property, this class identifies if the home is possibly eligible for funding under
    the various funding schemes. It will also calculate the expected amount of funding available
    and flag any tenant specific requirements that need to be considered for the funding to be attained
    """

    SCHEMES = ["eco4", "gbis", "whlg"]

    # SAP half-bands. A score belongs to the highest band whose "From" value it reaches - see sap_to_eco_band
    ECO_SAP_SCORE_THREHOLDS = [
        {'Band': 'High_A', 'From': 96.0, 'Up to': 100.0, 'Mid-point': 98.0},
        {'Band': 'Low_A', 'From': 92.0, 'Up to': 96.0, 'Mid-point': 94.0},
        {'Band': 'High_B', 'From': 86.0, 'Up to': 91.0, 'Mid-point': 88.5},
        {'Band': 'Low_B', 'From': 81.0, 'Up to': 86.0, 'Mid-point': 83.5},
        {'Band': 'High_C', 'From': 74.5, 'Up to': 80.0, 'Mid-point': 77.25},
        {'Band': 'Low_C', 'From': 69.0, 'Up to': 74.5, 'Mid-point': 71.75},
        {'Band': 'High_D', 'From': 61.5, 'Up to': 68.0, 'Mid-point': 64.75},
        {'Band': 'Low_D', 'From': 55.0, 'Up to': 61.5, 'Mid-point': 58.25},
        {'Band': 'High_E', 'From': 46.5, 'Up to': 54.0, 'Mid-point': 50.25},
        {'Band': 'Low_E', 'From': 39.0, 'Up to': 46.5, 'Mid-point': 42.75},
        {'Band': 'High_F', 'From': 29.5, 'Up to': 38.0, 'Mid-point': 33.75},
        {'Band': 'Low_F', 'From': 21.0, 'Up to': 29.5, 'Mid-point': 25.25},
        {'Band': 'High_G', 'From': 10.5, 'Up to': 20.0, 'Mid-point': 15.25},
        {'Band': 'Low_G', 'From': 1.0, 'Up to': 10.5, 'Mid-point': 5.75}
    ]

    def __init__(
            self,
            tenure: "HousingType",
            starting_epc,
            starting_sap,
            postcode,
            floor_area,
            council_tax_band,
            property_recommendations,
            project_scores_matrix,
            whlg_eligible_postcodes,
            gbis_abs_rate: int,
            eco4_abs_rate: int,
    ):
        """
        :param tenure: Indicates if the property is a social or private home
        :param starting_epc: The current EPC rating (letter) of the property
        :param starting_sap: The current SAP score (numeric) for the property
        :param postcode: The postcode of the property, matched in lower case against the WHLG postcodes
        :param floor_area: The total floor area of the property
        :param council_tax_band: The council tax band of the property, or None if unknown
        :param property_recommendations: The recommendations for the property
        :param project_scores_matrix: The matrix of project scores for ECO4
        :param whlg_eligible_postcodes: The postcodes eligible for WHLG
        :param gbis_abs_rate: The assumed £/abs achieved by the installer for GBIS
        :param eco4_abs_rate: The assumed £/abs achieved by the installer for ECO4
        """

        # TODO: Things we need to include:
        #   1) Amount of funding
        #   2) Fundable measures, as a subset of measures may be fundable, not all
        #   3) Use Pydantic to validate the parameter types

        self.tenure = tenure
        self.starting_epc = starting_epc
        self.starting_sap = starting_sap
        self.postcode = postcode
        self.starting_eco_band = self.sap_to_eco_band(self.starting_sap)
        self.floor_area_segment = self.classify_floor_area(floor_area)
        self.gbis_abs_rate = gbis_abs_rate
        self.eco4_abs_rate = eco4_abs_rate
        self.council_tax_band = council_tax_band

        self.recommendations = property_recommendations

        self.measure_types = list({r["measure_type"] for r in property_recommendations if r["default"]})

        # Filter the eco4 project scores matrix on scores relevant to this property
        self.project_scores_matrix = project_scores_matrix[
            (project_scores_matrix["Floor Area Segment"] == self.floor_area_segment) &
            (project_scores_matrix["Starting Band"] == self.starting_eco_band)
        ]

        # The postcode column is already lower case
        self.whlg_eligible_postcodes = whlg_eligible_postcodes[
            whlg_eligible_postcodes["Postcode"] == self.postcode.lower()
        ]

        # Store the final outputs. The misspelt "gbis_eligibiltiy" is read by callers, so the name is kept
        self.gbis_eligibiltiy = {}
        self.eco4_eligibility = {}
        self.whlg_eligibility = {}

    def output(
            self,
            scheme: str,
            eligible: bool,
            types: List[str],
            measure_types: List[str],
            project_score: float,
            estimated_funding: float,
            notify_tenant_benefits_requirements: bool,
            notify_council_tax_band_requirements: bool,
            notify_tenant_low_income_requirements: bool,
            innovation_required: bool,
    ):
        """
        Builds the standard eligibility record for a scheme

        :raises ValueError: If the scheme is not one of SCHEMES
        :return: Dictionary describing the eligibility, the funding and any tenant requirements to be checked
        """

        if scheme not in self.SCHEMES:
            raise ValueError("Scheme not recognised")

        return {
            "scheme": scheme,
            "eligible": eligible,
            "type": types,
            "measure_types": measure_types,
            "project_score": project_score,
            "estimated_funding": estimated_funding,
            "requires_benefits": notify_tenant_benefits_requirements,
            "requires_council_tax_band": notify_council_tax_band_requirements,
            "requires_low_income": notify_tenant_low_income_requirements,
            "innovation_required": innovation_required,
        }

    @staticmethod
    def classify_floor_area(floor_area):
        """
        Classifies the floor area into the segment labels used by the project scores matrix
        """
        if floor_area <= 72:
            return "0-72"

        if floor_area <= 97:
            return "73-97"

        if floor_area <= 199:
            return "98-199"

        return "200"

    def find_gbis_measures(self, measures):
        """
        The best measure is one that:
        1) Creates some SAP movement, therefore enables eligiblity
        2) Generates the most funding
        3) Has a reasonable ROI

        :param measures: The measure types (or recommendation types) that can be funded
        :return: List of records with keys type, measure_type, project_score and estimated_funding, sorted
            by the cost to the client per SAP point. Empty if no default measure moves the SAP half-band
        """
        # Only default recommendations are considered, whether matched on type or on measure type
        candidates = [
            m for m in self.recommendations
            if m["default"] and ((m["type"] in measures) or (m["measure_type"] in measures))
        ]
        if not candidates:
            return []

        measure_table = pd.DataFrame(candidates)

        measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap
        # We classify the movement
        measure_table["Finishing Band"] = np.floor(measure_table["post_install_sap"]).apply(self.sap_to_eco_band)
        # Remove any measures that generate zero SAP movement
        measure_table = measure_table[measure_table["Finishing Band"] != self.starting_eco_band]

        if measure_table.empty:
            # Nothing moves the half-band, so nothing can be funded
            return []

        # We merge on the project matrix, on post install band
        measure_table = measure_table.merge(
            self.project_scores_matrix, how="left", on="Finishing Band"
        )
        # Cost Savings is the abs
        measure_table["estimated_funding"] = measure_table["Cost Savings"] * self.gbis_abs_rate
        # We cap any estimated funding at the install cost
        measure_table["estimated_funding"] = np.where(
            measure_table["estimated_funding"] >= measure_table["total"],
            measure_table["total"],
            measure_table["estimated_funding"]
        )

        # Sort by the measure that will cost the client the least, per sap point
        measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"]
        measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"]
        measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False])

        return measure_table[
            ["type", "measure_type", "Cost Savings", "estimated_funding"]
        ].rename(columns={"Cost Savings": "project_score"}).to_dict("records")

    def sap_to_eco_band(self, sap_points):
        """
        Given a sap point score, this function will classify the points into the SAP half-band

        The score is placed in the highest band whose "From" value it reaches. This gives a single band for
        scores sat on a shared boundary (e.g. 86 is High_B) and for scores that fall between the "Up to" of
        one band and the "From" of the next (e.g. 91.5 is High_B).

        :param sap_points: The SAP score to classify
        :raises ValueError: If the score is below the lowest band
        :return: The half-band label, e.g. "High_D"
        """

        if sap_points > 100:
            return "High_A"

        for band in sorted(self.ECO_SAP_SCORE_THREHOLDS, key=lambda b: b["From"], reverse=True):
            if sap_points >= band["From"]:
                return band["Band"]

        raise ValueError("We should have a single classifcation for SAP points to half band")

    def _gbis_outputs(
            self,
            recommended_measures,
            benefits=False,
            council_tax=False,
            low_income=False,
            innovation=False,
    ):
        """
        Builds one GBIS eligibility record per recommended measure, with the given notification flags
        """
        return [
            self.output(
                scheme="gbis",
                eligible=True,
                types=[m["type"]],  # This is single measure so we only have one type
                measure_types=[m["measure_type"]],
                project_score=m["project_score"],
                estimated_funding=m["estimated_funding"],
                notify_tenant_benefits_requirements=benefits,
                notify_council_tax_band_requirements=council_tax,
                notify_tenant_low_income_requirements=low_income,
                innovation_required=innovation
            ) for m in recommended_measures
        ]

    def gbis_prs(self):
        """
        Checks if a private rental is eligible for GBIS. There are the following possible options
        1) General Eligibilty, contigent on EPC D-G and council tax band A-D. Excludes CWI, LI and heating
        controls
        2) Low income group - contigent on EPC D-G and tenant must receive benefits. Excludes heating controls
        3) GBIS Flex route 1, 3 - Great British Insulation Scheme Routes 1 and 3 are for pre-installation
        SAP bands D-G for owner-occupied households, D-E for private rented sector households
        (Including F & G if exempt from MEES). If houseold is low income. Excludes heating controls
        4) GBIS Flex route 2 - EPC E - G and low income household. Excludes heating controls

        Eligible measures:
        • Solid wall
        • pitched roof
        • flat roof
        • under floor
        • solid floor park home and
        • room in-roof insulation

        :return: List of eligibility records, empty if there is no funding availability
        """

        # Not available for every eligiblity type
        restricted_measures = ["cavity_wall_insulation", "loft_insulation"]
        valid_measures = [
            "internal_wall_insulation",
            "external_wall_insulation",
            "flat_roof_insulation",
            "suspended_floor_insulation",
            "room_roof_insulation",
        ] + restricted_measures
        general_measures = [m for m in valid_measures if m not in restricted_measures]

        # Every route requires an EPC rating of D-G
        if self.starting_epc not in ["G", "D", "E", "F"]:
            return []

        # General Eligibility
        if (
                any(measure in general_measures for measure in self.measure_types) and
                (self.council_tax_band in [None, "A", "B", "C", "D"])
        ):
            # This function pulls out the various measures that can provide funding under GBIS
            recommended_measures = self.find_gbis_measures(measures=general_measures)
            if recommended_measures:
                # If the council tax band is missing, we nofify the customer that this is a requirement that
                # should be checked
                return self._gbis_outputs(recommended_measures, council_tax=self.council_tax_band is None)
            # Otherwise we fall through, as CWI/LI may still be fundable via the low income route

        # Low income/flex
        if any(measure in valid_measures for measure in self.measure_types):
            # Find the best measure, and can also include CWI/LI but requires the tenant to be
            # low inome or on benefits
            recommended_measures = self.find_gbis_measures(measures=valid_measures)
            return self._gbis_outputs(recommended_measures, benefits=True, low_income=True)

        # Otherwise, no funding availability
        return []

    def gbis_social(self):
        """
        Because this is social housing, we have two typical means for eligibility
        1) EPC D, where an innovation measure is required
        2) EPC G-E, where an innovation measure isn't required

        :return: List of eligibility records, empty if there is no funding availability
        """
        if self.starting_epc not in ["D", "E", "F", "G"]:
            return []

        # All measures are available
        valid_measures = [
            "internal_wall_insulation",
            "external_wall_insulation",
            "flat_roof_insulation",
            "suspended_floor_insulation",
            "room_roof_insulation",
            "cavity_wall_insulation",
            "loft_insulation",
            "heating_control"
        ]

        recommended_measures = self.find_gbis_measures(measures=valid_measures)

        return self._gbis_outputs(recommended_measures, innovation=self.starting_epc == "D")

    def gbis(self):
        """
        Check if a property is eligible for GBIS, storing the result on the object

        :raises NotImplementedError: If the tenure is neither private nor social
        """

        if self.tenure == "Private":
            self.gbis_eligibiltiy = self.gbis_prs()
            return

        if self.tenure == "Social":
            self.gbis_eligibiltiy = self.gbis_social()
            return

        raise NotImplementedError("GBIS eligibility is only implemented for private and social tenures")

    def whlg(self):
        """
        Check if a property is eligible for WHLG, storing the result on the object

        :raises NotImplementedError: If the property is not social housing and sits in an eligible postcode
        """
        if self.tenure == "Social":
            # We can't do anything for social housing
            self.whlg_eligibility = []
            return

        if not self.whlg_eligible_postcodes.empty:
            # TODO: build the eligibility records with self.output(scheme="whlg", ...)
            raise NotImplementedError("WHLG eligibility is not implemented yet")

    def eco4(self):
        """
        Checks if a property is eligible for ECO4. The result will be stored in self.eco4_eligibility

        :raises NotImplementedError: For private rentals, as the check has not been written yet
        """
        if self.tenure == "Private":
            raise NotImplementedError("ECO4 eligibility for private rentals is not implemented yet")

    def check_eligibiltiy(self):
        """
        This function instigates the checking process
        :return:
        """

        self.gbis()
        # ECO4 is not checked until self.eco4() is implemented
        self.whlg()
owner_floor_area = False + def __init__( self, id, @@ -103,7 +108,7 @@ class Property: self.already_installed = ast.literal_eval(already_installed['already_installed']) if already_installed else [] self.non_invasive_recommendations = ( - ast.literal_eval(non_invasive_recommendations['recommendations']) if + non_invasive_recommendations['recommendations'] if non_invasive_recommendations else [] ) # This is a list of measures that have been recommended for the property @@ -132,9 +137,14 @@ class Property: self.energy_cost_estimates = {} self.energy_consumption_estimates = {} + # when storing the energy, we'll also self.energy = { "primary_energy_consumption": epc_record.get("energy_consumption_current"), - "co2_emissions": epc_record.get("co2_emissions_current"), + "epc_co2_emissions": epc_record.get("co2_emissions_current"), + # These will be added in once we estimate the amount of emissions from appliances - using the carbon + # intensity of electricity + "appliances_co2_emissions": None, + "co2_emissions": None } self.ventilation = { "ventilation": epc_record.get("mechanical_ventilation"), @@ -202,6 +212,11 @@ class Property: # TODO: We keep this but only temporarily until we add bathrooms, bedrooms, building id to the condition data self.parse_kwargs(kwargs) + # Funding + self.gbis_eligibiltiy = None + self.eco4_eligibility = None + self.whlg_eligibility = None + @classmethod def extract_kwargs(cls, kwargs): """ @@ -215,25 +230,24 @@ class Property: # as we collect more data from the energy assessment n_bathrooms = kwargs.get("n_bathrooms", None) - if n_bathrooms not in [None, ""]: - # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5 - n_bathrooms = int(round(float(n_bathrooms) + 1e-5)) + # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5 + n_bathrooms = int(round(float(n_bathrooms) + 1e-5)) if n_bathrooms not in [None, ""] else None n_bedrooms = 
kwargs.get("n_bedrooms", None) - if n_bedrooms not in [None, ""]: - n_bedrooms = int(round(float(n_bedrooms) + 1e-5)) + n_bedrooms = int(round(float(n_bedrooms) + 1e-5)) if n_bedrooms not in [None, ""] else None number_of_floors = kwargs.get("number_of_floors", None) - if number_of_floors not in [None, ""]: - number_of_floors = int(round(float(number_of_floors) + 1e-5)) + number_of_floors = int(round(float(number_of_floors) + 1e-5)) if number_of_floors not in [None, ""] else None insulation_floor_area = kwargs.get("insulation_floor_area", None) - if insulation_floor_area not in [None, ""]: - insulation_floor_area = float(insulation_floor_area) + insulation_floor_area = float(insulation_floor_area) if insulation_floor_area not in [None, ""] else None insulation_wall_area = kwargs.get("insulation_wall_area", None) - if insulation_wall_area not in [None, ""]: - insulation_wall_area = float(insulation_wall_area) + insulation_wall_area = float(insulation_wall_area) if insulation_wall_area not in [None, ""] else None + + # We allow for the asset owner to provide us with total floor area, in the event of it being incorrect + floor_area = kwargs.get("floor_area", None) + floor_area = float(floor_area) if floor_area not in [None, ""] else None return { "n_bathrooms": n_bathrooms, @@ -242,12 +256,15 @@ class Property: "insulation_floor_area": insulation_floor_area, "insulation_wall_area": insulation_wall_area, "building_id": kwargs.get("building_id", None), + "floor_area": floor_area } def parse_kwargs(self, kwargs): # We extract the elements from kwargs that we recognise. 
Anything additional is ignored for arg, val in kwargs.items(): if val is not None: + if arg == "floor_area": + self.owner_floor_area = True setattr(self, arg, val) def create_base_difference_epc_record(self, cleaned_lookup: dict): @@ -257,14 +274,7 @@ class Property: It will be the same starting and ending EPC, as we don't have the expected EPC yet """ - # difference_record = self.epc_record - self.epc_record - - # TODO: change these lower and replace in the settings file - # print( - # "CHANGE THE LATEST FIELD TO REMOVE NUMBER HABITABLE ROOMS IF WE WANT TO USE STARTING/ENDING" - # ) fixed_data_col_names = MANDATORY_FIXED_FEATURES + LATEST_FIELD - # print("NEED TO CHANGE THE DASH TO LOWER CASE") fixed_data_col_names = [ x.lower().replace("_", "-") for x in fixed_data_col_names ] @@ -275,8 +285,6 @@ class Property: if k in fixed_data_col_names } - # difference_record.append_fixed_data(fixed_data) - difference_record = self.epc_record.create_EPCDifferenceRecord( self.epc_record, fixed_data ) @@ -285,10 +293,11 @@ class Property: datasets=[difference_record], cleaned_lookup=cleaned_lookup ) - # TODO: adjust the base difference record with the previously calculated u values + features - # estimated_perimeter is different to the perimeter in the epc record - - # self.base_difference_record.df + # If we have variables that have been given to us by the landlord that we know are correct, whereas the EPC + # may not be, we use them + if self.owner_floor_area is not None: + self.base_difference_record.df["total_floor_area_ending"] = self.floor_area + self.base_difference_record.df["estimated_perimeter_ending"] = self.perimeter def simulate_all_representative_recommendations( self, property_representative_recommendations, @@ -374,7 +383,7 @@ class Property: for rec in property_recommendations_by_phase: # We simulate the impact of the recommendation at this current phase, and all of the prior phases - if rec["type"] in ["mechanical_ventilation", "trickle_vents", 
"draught_proofing"]: + if rec["type"] in ["trickle_vents", "draught_proofing"]: continue scoring_dict = self.create_recommendation_scoring_data( @@ -382,8 +391,8 @@ class Property: recommendation_record=recommendation_record, recommendations=previous_phase_representatives + [rec], primary_recommendation_id=rec["recommendation_id"], - non_invasive_recommendations=self.non_invasive_recommendations, ) + self.recommendations_scoring_data.append(scoring_dict) simulation_epc = self.epc_record.prepared_epc.copy() @@ -426,6 +435,18 @@ class Property: if phase_epc_transformation[k] == v: continue + if k == "hotwater-description": + if ( + v == "From main system" + ) and ( + phase_epc_transformation["mainheat-description"] == "Electric storage heaters" + ) and ( + "Electric immersion" in phase_epc_transformation["hotwater-description"] + ): + # It means we've recommended HHR with electric immersion, and shouldn't overwrite + # the hot water description + continue + raise NotImplementedError( "Already have this key in the phase_epc_transformation - implement me" ) @@ -441,7 +462,7 @@ class Property: if self.simulation_epcs is None: raise ValueError("Simulation EPCs have not been created") - rec_ids = sorted(list(self.simulation_epcs.keys())) + rec_ids = list(self.simulation_epcs.keys()) updated_simulation_epcs = [] for rec_id in rec_ids: sim_epc = self.simulation_epcs[rec_id].copy() @@ -467,15 +488,12 @@ class Property: # Now we havet this data inthe self.updated_simulation_epcs = updated_simulation_epcs - return updated_simulation_epcs - @staticmethod def create_recommendation_scoring_data( property_id, recommendation_record, recommendations: list, primary_recommendation_id: int, - non_invasive_recommendations: list = None, ): """ This function will iterate through a list of recommendations and apply a simulation for each recommendation @@ -484,7 +502,6 @@ class Property: :param recommendation_record: The record of the property, which will be updated :param recommendations: 
The list of recommendations to apply :param primary_recommendation_id: The id of the primary recommendation, which is used to identify the record - :param non_invasive_recommendations: The list of non-invasive recommendations :return: The updated recommendation record """ @@ -513,7 +530,7 @@ class Property: "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation", "cylinder_thermostat", "loft_insulation", "room_roof_insulation", "flat_roof_insulation", "solid_floor_insulation", "suspended_floor_insulation", "mixed_glazing", - "windows_glazing" + "windows_glazing", "mechanical_ventilation" ]: # We update the data, as defined in the recommendaton for prefix in ["walls", "roof", "floor"]: @@ -539,7 +556,7 @@ class Property: "solid_floor_insulation", "suspended_floor_insulation", "windows_glazing", "solar_pv", "heating", "hot_water_tank_insulation", "heating_control", "secondary_heating", "cylinder_thermostat", "mixed_glazing", - "extension_cavity_wall_insulation", + "extension_cavity_wall_insulation", "mechanical_ventilation", ]: raise NotImplementedError( "Implement me, given type %s" % recommendation["type"] @@ -707,6 +724,15 @@ class Property: "unadjusted": unadjusted_kwh_estimates } + # Update carbon with appliances + self.energy["appliances_co2_emissions"] = ( + (unadjusted_kwh_estimates["appliances"] * assumptions.ELECTRICITY_CARBON_INTENSITY) / 1000 + ) + # Re-calculate total CO2 emissions + self.energy["co2_emissions"] = float(np.round( + self.energy["epc_co2_emissions"] + self.energy["appliances_co2_emissions"], 2 + )) + def set_spatial(self, spatial: pd.DataFrame): """ Sets whether the property is in a conservation area given the output of the ConservationAreaClient @@ -1226,6 +1252,15 @@ class Property: if (self.building_id is not None) and (self.solar_panel_configuration is not None): return True + # If the property is in a conservation area, is listed or is a heriage building, solar panels + # become a difficult measure to 
generally get through planning restrictions and so we do not recommend + # solar panels + if self.is_listed or self.is_heritage: + # If the property is in a conservation area, we can still recommend solar panels + # but they need to be done in a way that is sympathetic to the building. E.g. the panels + # may be installed such that they are not visible from the street + return False + is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"] is_valid_roof_type = ( self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"] @@ -1294,3 +1329,11 @@ class Property: ) return electric_consumption + + def insert_funding(self, funding_calulator: Funding): + """ + This method inserts the funding into the property object + """ + self.gbis_eligibiltiy = funding_calulator.gbis_eligibiltiy + self.eco4_eligibility = funding_calulator.eco4_eligibility + self.whlg_eligibility = funding_calulator.whlg_eligibility diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py index 367d8c85..96b7c5de 100644 --- a/backend/SearchEpc.py +++ b/backend/SearchEpc.py @@ -2,6 +2,7 @@ import os import time import re +from urllib.parse import urlencode import usaddress import pandas as pd import numpy as np @@ -95,7 +96,7 @@ vartypes = { 'walls-env-eff': 'str', 'transaction-type': 'str', # 'uprn': "Int64", - 'current-energy-efficiency': 'float', + 'current-energy-efficiency': 'Int64', 'energy-consumption-current': 'float', 'mainheat-description': 'str', 'lighting-cost-current': 'float', @@ -138,8 +139,8 @@ class SearchEpc: } NODATA = { - "status": 201, - "message": "No data", + "status": 204, + "message": "no data", "error": None } @@ -154,7 +155,7 @@ class SearchEpc: uprn: [int, None] = None, size=None, property_type=None, - fast=False + fast=False, ): """ Address lines 1 and postcode are mandatory fields. 
The other address lines are optional @@ -206,10 +207,15 @@ class SearchEpc: try: # Updated regex to catch house numbers including alphanumeric ones - pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)' - match = re.search(pattern, address) - if match: - return next(g for g in match.groups() if g is not None) + pattern = r'(?i)(?:flat|apartment|room)\s*(\d+\w*)|^\s*(\d+\w*)' + match1 = re.search(pattern, address) + if match1: + return next(g for g in match1.groups() if g is not None) + + pattern2 = r'(?i)(flat|apartment|room)\s*([a-zA-Z]?\d+[a-zA-Z]?)' + match2 = re.search(pattern2, address) + if match2: + return match2.group(2) parsed = usaddress.parse(address) # First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected @@ -220,7 +226,8 @@ class SearchEpc: continue if part == postcode.split(" ")[1]: continue - return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary + return part.rstrip(",") + # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary # number # Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found @@ -247,46 +254,36 @@ class SearchEpc: else: return None - def get_epc(self, params=None, size=None): - # Get the EPC data with retries - size = size if size is not None else self.size - if params is None: - if self.uprn: - params = {"uprn": self.uprn} - else: - params = {"address": self.address1, "postcode": self.postcode} + def _get_epc(self, params, size): + """ + To be called by get_epc() - not for external usage + """ + + url = os.path.join(self.client.domestic.host, "search") + if size: + url += "?" 
+ urlencode({k: v for k, v in {"size": size}.items() if v}) for retry in range(self.max_retries): try: - if "uprn" in params: - # We use the direct call method inside, since we need to implement uprn as a valid - # parameter for the search function - url = os.path.join(self.client.domestic.host, "search") - response = self.client.domestic.call(method="get", url=url, params=params) - else: - response = self.client.domestic.search(params=params, size=size) + response = self.client.domestic.call(method="get", url=url, params=params) if response: self.data = response - return self.SUCCESS + return { + "response": response, + "msg": self.SUCCESS + } if retry > 0: logger.info("Failed previous attempt but retry successful") # If we got nothing, final try if not response: return { - "status": 204, - "message": "no data", - "error": None + "response": response, + "msg": self.NODATA } - return { - "status": 200, - "message": "success", - "error": None - } - except Exception as e: if retry < self.max_retries - 1: # If not the last retry, wait for 3 seconds before retrying @@ -294,11 +291,66 @@ class SearchEpc: else: # If it's the last retry, we continue return { - "status": 500, - "message": "Could not retrieve EPC data", - "error": str(e) + "response": {}, + "msg": { + "status": 500, + "message": "Could not retrieve EPC data", + "error": str(e) + } } + def get_epc(self, params=None, size=None): + # Get the EPC data with retries + size = size if size is not None else self.size + if params: + output = self._get_epc(params=params, size=size) + if output["msg"]["status"] == 200: + self.data = output["response"] + return output["msg"] + + if not self.uprn and not self.address1 and not self.postcode: + raise ValueError("No search parameters provided") + + uprn_params = {"uprn": self.uprn} if self.uprn else {} + address_params = {} + if self.address1: + address_params["address"] = self.address1 + if self.postcode: + address_params["postcode"] = self.postcode + + # We attempt the 
search with uprn params + + data = {"rows": []} + api_response = {} + if uprn_params: + api_response = self._get_epc(params=uprn_params, size=size) + if api_response["msg"]["status"] == 200: + data["rows"].extend(api_response["response"]["rows"]) + + # If we were unsuccessful, we then make a second attempt to fetch the data. We find that + # properties are sometimes listed under the wrong UPRN + if address_params: + api_response = self._get_epc(params=address_params, size=size) + if api_response["msg"]["status"] == 200: + # We update the data with the correct uprn + if self.uprn: + for x in api_response["response"]["rows"]: + x["uprn"] = self.uprn + + data["rows"].extend(api_response["response"]["rows"]) + + # We no de-dupe on lmk-key to avoid duplicates + seen = set() + data["rows"] = [ + row for row in data["rows"] + if row["lmk-key"] not in seen and not seen.add(row["lmk-key"]) + ] + + if data["rows"]: + api_response["msg"] = self.SUCCESS + + return api_response["msg"] + def filter_rows(self, rows, property_type=None, address=None): """ This method should not be used when property_type and address are both not None @@ -343,8 +395,12 @@ class SearchEpc: rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match[0]] else: best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0) + # Get the UPRN for the best match + best_match_uprn = {r["uprn"] for r in rows if r["address"] == best_match[0]}.pop() # Get all of the scores - rows_filtered = [r for r in rows if r["address"] == best_match[0]] + rows_filtered = [ + r for r in rows if (r["address"] == best_match[0]) or (r["uprn"] == best_match_uprn) + ] if rows_filtered: return rows_filtered @@ -643,6 +699,7 @@ class SearchEpc: estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy() estimation_data = estimation_data[~pd.isnull(estimation_data[key])] estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)] 
+ if vartype == "Int64": # We have some edge cases where we get the error "invalid literal for int() with base 10: '1.0'" # so this handles this @@ -654,6 +711,13 @@ class SearchEpc: estimated_epc[key] = None continue + if key == "floor-height": + # We speficially handle this, to avoid extreme values + # We check if we have any rows less than 3.5m + if estimation_data[estimation_data["floor-height"].astype(float) <= 3.5].shape[0] > 0: + # Perform the filter + estimation_data = estimation_data[estimation_data["floor-height"].astype(float) <= 3.5] + if vartype == "Int64": estimated_value = self._estimate_int(estimation_data, key) elif vartype == "float": @@ -676,7 +740,30 @@ class SearchEpc: estimated_epc["current-energy-rating"] = sap_to_epc(estimated_epc["current-energy-efficiency"]) + # Convert the cost current and potential variables - to string integers + for variable in ["heating-cost-current", "hot-water-cost-current", "lighting-cost-current", + "heating-cost-potential", "hot-water-cost-potential", "lighting-cost-potential"]: + estimated_epc[variable] = str(int(estimated_epc[variable])) + + # This is a string + estimated_epc["low-energy-fixed-light-count"] = ( + str(estimated_epc["low-energy-fixed-light-count"]) if estimated_epc["low-energy-fixed-light-count"] else "" + ) + # This is an int + estimated_epc["photo-supply"] = ( + int(np.round(estimated_epc["photo-supply"])) if estimated_epc["photo-supply"] else estimated_epc[ + "photo-supply"] + ) + + estimated_epc["co2-emiss-curr-per-floor-area"] = ( + estimated_epc["co2-emissions-current"] / estimated_epc["total-floor-area"] + ) + estimated_epc["postcode"] = self.postcode + if not self.uprn: + # Update self.uprn too + self.uprn = hash(self.address1 + self.postcode) + estimated_epc["uprn"] = self.uprn estimated_epc["address"] = self.full_address # Indicate that this epc was estimated diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py index 75f28ceb..cda32faa 100644 --- 
a/backend/apis/GoogleSolarApi.py +++ b/backend/apis/GoogleSolarApi.py @@ -9,8 +9,7 @@ from tqdm import tqdm from math import sin, cos, sqrt, atan2, radians from utils.logger import setup_logger -from recommendations.Costs import Costs, MCS_SOLAR_PV_COST_DATA -from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel +from recommendations.Costs import Costs from backend.ml_models.AnnualBillSavings import AnnualBillSavings from backend.Property import Property from backend.app.db.functions.solar_functions import get_solar_data, store_batch_data @@ -51,6 +50,16 @@ class GoogleSolarApi: MIN_UNIT_PANELS = 4 # Minimum number of panels we allow for a domestic building MIN_BUILDING_PANELS = 10 # Minimum number of panels we allow for a block of flats + # Max area of a roof space we allow panels for + PERCENTAGE_OF_ROOF_LIMIT = 0.8 + + # If the roof area that comes back from the solar API is more than 25% larger than the estiamted roof area + # that we calcualte based on the property dimensions, we will correct the roof area + ROOF_AREA_TOLERANCE = 1.25 + + # Error Messages + ENTITY_NOT_FOUND_ERROR = 'Requested entity was not found.' + def __init__(self, api_key, max_retries=5): """ Initialize the GoogleSolarApi class with the provided API key and maximum retries. 
@@ -109,6 +118,13 @@ class GoogleSolarApi: response.raise_for_status() # Raise an error for bad status codes return response.json() except requests.exceptions.RequestException as e: + if ( + (e.response.status_code == 404) & + (e.response.json()["error"]["message"] == self.ENTITY_NOT_FOUND_ERROR) + ): + logger.warning("No building insights found for the given location.") + return {"error": self.ENTITY_NOT_FOUND_ERROR} + attempt += 1 print(f"Attempt {attempt} failed: {e}") time.sleep(2 ** attempt) # Exponential backoff @@ -152,6 +168,10 @@ class GoogleSolarApi: # If we have no data in the db, or updated_at is more than 6 months if self.insights_data is None or is_outdated: self.insights_data = self.get_building_insights(longitude, latitude, required_quality) + if self.insights_data.get("error") == self.ENTITY_NOT_FOUND_ERROR: + # We use default performance since in this case, we couldn't retrieve data. We don't store + self.panel_performance = self.default_panel_performance(property_instance=property_instance) + return self.need_to_store = True # Extract key data from the insights response @@ -159,12 +179,19 @@ class GoogleSolarApi: # Automatically exclude north-facing segments self.exclude_north_facing_segments(property_instance=property_instance) # If a property is semi-detached, it's possible for us to include segments from an attached unit - if (property_instance.data["built-form"] == "Semi-Detached") and ( - property_instance.data["extension-count"] == 0 - ): - self.exclude_likely_duplicate_surfaces() + if property_instance is not None: + if (property_instance.data["built-form"] == "Semi-Detached") and ( + property_instance.data["extension-count"] == 0 + ): + self.exclude_likely_duplicate_surfaces() + # We constrain the roof area, based on the floor area to be more conservative self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2'] + if ( + self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE + ) | 
(self.roof_area < (2 - self.ROOF_AREA_TOLERANCE) * property_instance.roof_area): + self.roof_area = property_instance.roof_area + self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2'] self.panel_wattage = self.insights_data["solarPotential"]["panelCapacityWatts"] if self.panel_wattage != 400: @@ -179,7 +206,9 @@ class GoogleSolarApi: # We now start finding the solar panel configurations self.optimise_solar_configuration( - energy_consumption=energy_consumption, is_building=is_building, property_instance=property_instance + energy_consumption=energy_consumption, + is_building=is_building, + property_instance=property_instance ) # Finally, if we have a double property, we half the data we stored area @@ -259,8 +288,6 @@ class GoogleSolarApi: # minimum is 4 min_panels = self.MIN_BUILDING_PANELS if is_building else self.MIN_UNIT_PANELS - cost_instance = Costs(property_instance=property_instance) if property_instance is not None else None - # Remove any north facing roof segments panel_performance = [] for config in self.insights_data["solarPotential"].get("solarPanelConfigs", []): @@ -294,14 +321,12 @@ class GoogleSolarApi: if roi_summary["n_panels"].sum() < min_panels: continue - if cost_instance is None: - total_cost = MCS_SOLAR_PV_COST_DATA["average_cost_per_kwh"] * (wattage / 1000) - else: - total_cost = cost_instance.solar_pv( - n_panels=roi_summary["n_panels"].sum(), - has_battery=False, - n_floors=property_instance.number_of_floors, - )["total"] + total_cost = Costs.solar_pv( + n_panels=roi_summary["n_panels"].sum(), + has_battery=False, + # Assume the most amount of scaffolding + n_floors=3 if property_instance is None else property_instance.number_of_floors + )["total"] weighted_ratio = np.average( roi_summary["ratio"].values, weights=roi_summary["generated_dc_energy"].values @@ -491,6 +516,11 @@ class GoogleSolarApi: panel_performance = panel_performance.drop(columns=["n_panels_halved"]) panel_performance = 
panel_performance[panel_performance["n_panels"] >= min_panels] + # Finally, we prevent pannelled roof area being above a limit + panel_performance = panel_performance[ + panel_performance["panneled_roof_area"] <= self.roof_area * self.PERCENTAGE_OF_ROOF_LIMIT + ] + self.panel_performance = panel_performance def exclude_north_facing_segments(self, property_instance): @@ -792,15 +822,19 @@ class GoogleSolarApi: property_instance = [p for p in input_properties if p.id == unit["property_id"]][0] # At this level, we check if the property is suitable for solar and if now, skip # Or if we have a solar non-invasive recommendation + + non_invasive_rec = next( + (r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"), {} + ).get("array_wattage") + if ( (not property_instance.is_solar_pv_valid()) or - [r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"] + non_invasive_rec is not None ): continue if unit["longitude"] is None or unit["latitude"] is None: # At this point, we've checked that solar PV is valid, and so we provide some defaults - property_instance.set_solar_panel_configuration( solar_panel_configuration={ "insights_data": None, @@ -855,19 +889,19 @@ class GoogleSolarApi: cost_instance = Costs(property_instance=property_instance) - # We return a 2.4 and 4 kwp system + # We return a 1.6 and 3.2 kwp system panel_performance = pd.DataFrame( [ { - 'n_panels': 10, - 'yearly_dc_energy': 4000 * 0.99, # Assumed 99% efficient wattage -> dc + 'n_panels': 8, + 'yearly_dc_energy': 3200 * assumptions.MEDIAN_WATTAGE_TO_DC, 'total_cost': cost_instance.solar_pv( - n_panels=10, has_battery=False, n_floors=property_instance.number_of_floors + n_panels=8, has_battery=False, n_floors=property_instance.number_of_floors )["total"], 'weighted_ratio': None, - 'panneled_roof_area': 10 * assumptions.RDSAP_AREA_PER_PANEL, - 'array_wattage': 4000, - 'initial_ac_kwh_per_year': 4000 * 0.95, # Assumed 95% efficient wattage -> ac + 
'panneled_roof_area': 8 * assumptions.RDSAP_AREA_PER_PANEL, + 'array_wattage': 3200, + 'initial_ac_kwh_per_year': 3200 * assumptions.MEDIAN_WATTAGE_TO_AC, 'lifetime_ac_kwh': None, 'lifetime_dc_kwh': None, 'roi': None, @@ -879,15 +913,15 @@ class GoogleSolarApi: 'rank': None }, { - 'n_panels': 6, - 'yearly_dc_energy': 2400 * 0.99, # Assumed 99% efficient wattage -> dc + 'n_panels': 4, + 'yearly_dc_energy': 1600 * assumptions.MEDIAN_WATTAGE_TO_DC, 'total_cost': cost_instance.solar_pv( n_panels=6, has_battery=False, n_floors=property_instance.number_of_floors )["total"], 'weighted_ratio': None, - 'panneled_roof_area': 6 * assumptions.RDSAP_AREA_PER_PANEL, - 'array_wattage': 2400, - 'initial_ac_kwh_per_year': 2400 * 0.95, # Assumed 95% efficient wattage -> ac + 'panneled_roof_area': 4 * assumptions.RDSAP_AREA_PER_PANEL, + 'array_wattage': 1600, + 'initial_ac_kwh_per_year': 1600 * assumptions.MEDIAN_WATTAGE_TO_AC, 'lifetime_ac_kwh': None, 'lifetime_dc_kwh': None, 'roi': None, diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py index 79f2a087..f1090ef3 100644 --- a/backend/app/assumptions.py +++ b/backend/app/assumptions.py @@ -1,7 +1,7 @@ -# Assumes that the average efficiency of an air source heat pump is 250%, taking the median of the 200-400% range, -# which is often quoted as a sensible efficiency range for air source heat pumps. +# We assume that the ASHP efficiency is 280%, which is the minimum that Cotswolds Energy Group achieves, as +# they target this PESSIMISTIC_ASHP_EFFICIENCY = 200 -AVERAGE_ASHP_EFFICIENCY = 250 +AVERAGE_ASHP_EFFICIENCY = 280 # Conservative estimate of the proportion of electricity that will be consumed, whereas the rest will # be exported. These are averages based on Google research. E.g @@ -11,9 +11,15 @@ SOLAR_CONSUMPTION_WITH_BATTERY_PROPORTION = 0.7 # Typically, each solar panel takes up around 3.4 m2 of roof space under RdSAP. 
This was been verified in Elmhurst RDSAP_AREA_PER_PANEL = 3.4 +# This is a median based on a sample of properties +MEDIAN_WATTAGE_TO_AC = 0.965 +MEDIAN_WATTAGE_TO_DC = 0.99 SOCIAL_TENURES = ["Rented (social)", "rental (social)"] +# Carbon intensity of electricity, as of 16th Jan 2025 +ELECTRICITY_CARBON_INTENSITY = 0.232 + DESCRIPTIONS_TO_FUEL_TYPES = { "Air source heat pump, radiators, electric": { "fuel": "Electricity", "cop": AVERAGE_ASHP_EFFICIENCY / 100 @@ -50,4 +56,12 @@ DESCRIPTIONS_TO_FUEL_TYPES = { }, "Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85}, "Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1}, + "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85}, + "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85}, } + +# These are the measure types where if there is a ventilation recommendation, we force the inclusion of it +# if one of these has been recommended. +measures_needing_ventilation = [ + "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" +] diff --git a/backend/app/db/functions/recommendations_functions.py b/backend/app/db/functions/recommendations_functions.py index d6e41c61..d26adf66 100644 --- a/backend/app/db/functions/recommendations_functions.py +++ b/backend/app/db/functions/recommendations_functions.py @@ -138,7 +138,7 @@ def upload_recommendations(session: Session, recommendations_to_upload, property "recommendation_id": recommendation_id, "material_id": part["id"], "depth": int(part["depth"]) if part["depth"] else None, - "quantity": part["quantity"], + "quantity": float(part["quantity"]), "quantity_unit": part["quantity_unit"], "estimated_cost": part["total"], } diff --git a/backend/app/db/models/materials.py b/backend/app/db/models/materials.py index f0af3343..9f8abbf4 100644 --- a/backend/app/db/models/materials.py +++ b/backend/app/db/models/materials.py @@ -19,6 +19,7 @@ class MaterialType(enum.Enum): flat_roof_insulation = 
"flat_roof_insulation" room_roof_insulation = "room_roof_insulation" windows_glazing = "windows_glazing" + cavity_wall_extraction = "cavity_wall_extraction" iwi_wall_demolition = "iwi_wall_demolition" iwi_vapour_barrier = "iwi_vapour_barrier" diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py index 119c2061..80a531bf 100644 --- a/backend/app/plan/router.py +++ b/backend/app/plan/router.py @@ -1,3 +1,4 @@ +import ast import json from datetime import datetime @@ -27,9 +28,11 @@ from backend.app.dependencies import validate_token from backend.app.plan.schemas import PlanTriggerRequest from backend.app.plan.utils import get_cleaned from backend.app.utils import epc_to_sap_lower_bound, sap_to_epc +import backend.app.assumptions as assumptions from backend.ml_models.api import ModelApi from backend.Property import Property +from backend.Funding import Funding from backend.apis.GoogleSolarApi import GoogleSolarApi from recommendations.optimiser.CostOptimiser import CostOptimiser @@ -42,6 +45,7 @@ from backend.ml_models.Valuation import PropertyValuation from etl.bill_savings.KwhData import KwhData from etl.spatial.OpenUprnClient import OpenUprnClient +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc logger = setup_logger() @@ -120,7 +124,7 @@ def extract_portfolio_aggregation_data( # We can now calculate multiple outputs based on default recommendations carbon_savings = sum([r["co2_equivalent_savings"] for r in default_recommendations]) - pre_retrofit_co2 = p.data["co2-emissions-current"] + pre_retrofit_co2 = p.energy["co2_emissions"] post_retrofit_co2 = pre_retrofit_co2 - carbon_savings pre_retrofit_energy_bill = sum(p.current_energy_bill.values()) @@ -337,7 +341,10 @@ def extract_property_request_data( # Because we have some non-invasive recommendations that match on address and postcode, but not UPRN # we need to check existence of uprn - has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True + 
has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False + if has_uprn: + has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None] + if has_uprn: property_non_invasive_recommendations = next(( x for x in non_invasive_recommendations if @@ -352,7 +359,6 @@ def extract_property_request_data( ), {}) if isinstance(property_non_invasive_recommendations.get("recommendations"), str): - import ast property_non_invasive_recommendations["recommendations"] = ast.literal_eval( property_non_invasive_recommendations["recommendations"] ) @@ -363,16 +369,49 @@ def extract_property_request_data( else: transformed.append(rec) - property_non_invasive_recommendations["recommendations"] = str(transformed) + property_non_invasive_recommendations["recommendations"] = transformed - property_valution = next(( - float(x["value"]) for x in valuation_data if - (str(x["uprn"]) == str(uprn)) - ), None) + # Check if the valuation data has uprn + valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False + if valuation_has_uprn: + valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None] + + if valuation_has_uprn: + property_valution = next(( + float(x["valuation"]) for x in valuation_data if + (str(x["uprn"]) == str(uprn)) + ), None) + else: + property_valution = next(( + float(x["valuation"]) for x in valuation_data if + (x["address"] == config["address"]) and (x["postcode"] == config["postcode"]) + ), None) return patch, property_already_installed, property_non_invasive_recommendations, property_valution +def get_funding_data(): + """ + This function retrieves the eco project scores matrix and the warm homes local grant funding data + :return: + """ + project_scores_matrix = read_csv_from_s3( + bucket_name=get_settings().DATA_BUCKET, + filepath="funding/ECO4 Full Project Scores Matrix.csv", + ) + project_scores_matrix = pd.DataFrame(project_scores_matrix) + project_scores_matrix.columns = ['Floor Area Segment', 
'Starting Band', 'Finishing Band', 'Cost Savings'] + project_scores_matrix["Cost Savings"] = project_scores_matrix["Cost Savings"].astype(float) + + whlg_eligible_postcodes = read_csv_from_s3( + bucket_name=get_settings().DATA_BUCKET, + filepath="funding/whlg eligible postcodes.csv", + ) + whlg_eligible_postcodes = pd.DataFrame(whlg_eligible_postcodes) + + return project_scores_matrix, whlg_eligible_postcodes + + router = APIRouter( prefix="/plan", tags=["plan"], @@ -393,6 +432,14 @@ async def trigger_plan(body: PlanTriggerRequest): session.begin() logger.info("Getting the inputs") plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path) + # Check for duplicate UPRNS + input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x and x.get("uprn")] + + if input_uprns: + # Check for dupes + if len(input_uprns) != len(set(input_uprns)): + raise ValueError("Duplicate UPRNs in the input data") + # If we have patches or overrides, we should read them in here patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body) @@ -424,13 +471,22 @@ async def trigger_plan(body: PlanTriggerRequest): # Create a record in db property_id, is_new = create_property( - session, body.portfolio_id, epc_searcher.address_clean, epc_searcher.postcode_clean, - epc_searcher.uprn, - energy_assessment + session=session, + portfolio_id=body.portfolio_id, + address=epc_searcher.address_clean, + postcode=epc_searcher.postcode_clean, + uprn=epc_searcher.uprn, + energy_assessment=energy_assessment ) if not is_new and not body.multi_plan: continue + if epc_searcher.newest_epc is None: + raise ValueError( + "No EPCs found for this property and did not estimate - likely need to provide a" + "property type and built form" + ) + if is_new: create_property_targets( session, @@ -459,6 +515,14 @@ async def trigger_plan(body: PlanTriggerRequest): ) ) + # if we have a remote assment data type, we pull the 
additional data and include it + if body.event_type == "remote_assessment": + logger.info("Retrieving find my epc data") + property_non_invasive_recommendations = RetrieveFindMyEpc.get_from_epc( + epc_searcher.newest_epc + ) + # TODO: We need to determine if we should make a patch, if the EPC is new + epc_records = patch_epc(patch, epc_records) prepared_epc = EPCRecord( @@ -489,7 +553,8 @@ async def trigger_plan(body: PlanTriggerRequest): model_api = ModelApi( portfolio_id=body.portfolio_id, timestamp=created_at, - prediction_buckets=get_prediction_buckets() + prediction_buckets=get_prediction_buckets(), + max_retries=1 ) await model_api.async_warm_up_lambdas( model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES @@ -501,6 +566,7 @@ async def trigger_plan(body: PlanTriggerRequest): logger.info("Reading in materials and cleaned datasets") materials = get_materials(session) cleaned = get_cleaned() + eco_project_scores_matrix, whlg_eligible_postcodes = get_funding_data() kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True) @@ -584,8 +650,10 @@ async def trigger_plan(body: PlanTriggerRequest): recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data) recommendations_scoring_data = recommendations_scoring_data.drop( - columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", - "carbon_ending"] + columns=[ + "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending", + "carbon_ending" + ] ) all_predictions = await model_api.async_paginated_predictions( @@ -604,6 +672,7 @@ async def trigger_plan(body: PlanTriggerRequest): property_instance=property_instance, all_predictions=all_predictions, recommendations=recommendations, + representative_recommendations=representative_recommendations ) ) @@ -625,8 +694,6 @@ async def trigger_plan(body: PlanTriggerRequest): ) # We now insert kwh estimates and costs into the recommendations - # TODO: We 
should join the methodology which maps the heating and hot water descriptions to the fuel types in - # Recommendations, but also the Property class logger.info("Calculating tenant savings - kwh and bills") for property_id in tqdm([p.id for p in input_properties]): property_recommendations = recommendations.get(property_id, []) @@ -636,59 +703,130 @@ async def trigger_plan(body: PlanTriggerRequest): Recommendations.calculate_recommendation_tenant_savings( property_instance=property_instance, kwh_simulation_predictions=kwh_simulation_predictions, - property_recommendations=property_recommendations + property_recommendations=property_recommendations, + ashp_cop=body.ashp_cop ) ) property_instance.current_energy_bill = property_current_energy_bill # Insert the predictions into the recommendations and run the optimiser - # TODO: If a recommendation has a negative impact on SAP, we should remove it - this seems to have become a - # possibility with heating system - # TODO: After optimising, if there are any cheap, quick win measures (e.g. 
insulate water tank with hot water - # cylinder jacket), we should add these to the recommendations as default - for p in input_properties: if not recommendations.get(p.id): continue - input_measures = prepare_input_measures(recommendations[p.id], body.goal) + # we need to double unlist because we have a list of lists + property_measure_types = {rec["type"] for recs in recommendations[p.id] for rec in recs} - current_sap_points = int(p.data["current-energy-efficiency"]) - target_sap_points = epc_to_sap_lower_bound(body.goal_value) - sap_gain = CostOptimiser.calculate_sap_gain_with_slack(target_sap_points - current_sap_points) + property_required_measures = [ + m for m in recommendations[p.id] if m[0]["type"] in body.required_measures + ] + measures_to_optimise = [ + m for m in recommendations[p.id] if m[0]["type"] not in body.required_measures + ] - if not body.optimise: - if body.goal != "Increasing EPC": - raise NotImplementedError("Only EPC optimisation is currently supported") + # If we have a wall insulation measure, we MUST include mechanical ventilation + # Additionally, if we have required measures, they should also be included. 
Therefore + # we can discount the number of points required to get to the target SAP band (or increase) + # in the case of ventilation + needs_ventilation = any(x in property_measure_types for x in assumptions.measures_needing_ventilation) + + input_measures = prepare_input_measures(measures_to_optimise, body.goal, needs_ventilation) + + if not input_measures[0]: + # This means that we have no defaults + selected_recommendations = {} solution = [] - for sub_list in input_measures: - # Select the entry with the highest gain, and if tied, choose the one with the lowest cost - best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost'])) - solution.append(best_measure) else: - if body.budget: - optimiser = GainOptimiser( - input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0 + fixed_gain = 0 + if property_required_measures: + # We get the SAP points for the required measures + if body.goal != "Increasing EPC": + raise NotImplementedError("Only EPC optimisation is currently supported") + sap_by_type = [ + {"type": rec["type"], "sap_points": rec["sap_points"]} for recs in property_required_measures + for rec in recs + ] + # We get a MAX sap points per type + max_per_type = ( + pd.DataFrame(sap_by_type).groupby("type")["sap_points"].max().to_dict() ) + fixed_gain = sum(max_per_type.values()) + + property_required_measure_types = {rec["type"] for rec in sap_by_type} + + # if the property needs ventilation, but the measure we optimise didn't include + # venilation we add the points for ventilation as a fixed gain + if needs_ventilation and any( + r in property_required_measure_types for r in assumptions.measures_needing_ventilation + ): + fixed_gain += next( + (r[0]["sap_points"] for r in recommendations[p.id] if + r[0]["type"] == "mechanical_ventilation"), + 0 + ) + + current_sap_points = int(p.data["current-energy-efficiency"]) + + sap_gain = CostOptimiser.calculate_sap_gain_with_slack( + epc_to_sap_lower_bound(body.goal_value) - 
current_sap_points + ) - fixed_gain + + if not body.optimise: + if body.goal != "Increasing EPC": + raise NotImplementedError("Only EPC optimisation is currently supported") + solution = [] + for sub_list in input_measures: + # Select the entry with the highest gain, and if tied, choose the one with the lowest cost + best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost'])) + solution.append(best_measure) else: - # The minimum gain is the minimum number of SAP points required to get to the target SAP band - # If the gain is negative, the optimiser will return an empty solution - optimiser = CostOptimiser( - input_measures, - min_gain=sap_gain - ) - optimiser.setup() - optimiser.solve() - solution = optimiser.solution + if body.budget: + optimiser = GainOptimiser( + input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0 + ) + else: + # The minimum gain is the minimum number of SAP points required to get to the target SAP band + # If the gain is negative, the optimiser will return an empty solution + optimiser = CostOptimiser( + input_measures, + min_gain=sap_gain + ) - selected_recommendations = {r["id"] for r in solution} + optimiser.setup() + optimiser.solve() + solution = optimiser.solution + + selected_recommendations = {r["id"] for r in solution} + + if property_required_measures: + # We select the cheapest of the required measures, into selected + for recs in property_required_measures: + # We select the cheapest of the required measures + cost_to_id = { + rec["recommendation_id"]: rec["total"] for rec in recs + if rec["recommendation_id"] not in selected_recommendations + } + # Take the recommendation id with the lowers cost + + selected_recommendations.add(min(cost_to_id, key=cost_to_id.get)) + # Update the solution with the selected recommendaitons + solution = [] + for recs in recommendations[p.id]: + for rec in recs: + if rec["recommendation_id"] in selected_recommendations: + solution.append( + { + "id": 
rec["recommendation_id"], + "cost": rec["total"], + "gain": rec["sap_points"], + "type": rec["type"] + } + ) # If wall insulation is selected, we also include mechanical ventilation as a best practice measure - if any(x in [r["type"] for r in solution] for x in [ - "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation" - ]): + if any(x in [r["type"] for r in solution] for x in assumptions.measures_needing_ventilation): ventilation_rec = next( (r[0] for r in recommendations[p.id] if r[0]["type"] == "mechanical_ventilation"), None @@ -717,10 +855,57 @@ async def trigger_plan(body: PlanTriggerRequest): ] # We'll also unlist the recommendations so they're a bit easier to handle from here onwards - final_recommendations = [ + recommendations[p.id] = [ rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type ] - recommendations[p.id] = final_recommendations + + # when we have buildings, we tweak our solar PV recommendations as if one unit needs it, we apply it to all + # of them + # TODO: We can probably do better and optimise at the building level - this is temp + logger.info("Adjusting solar PV recommendations for buildings") + building_ids = set([p.building_id for p in input_properties if p.building_id is not None]) + + for bid in building_ids: + # We check if any of them have solar PV + building = [p for p in input_properties if p.building_id == bid] + has_solar = False + for unit in building: + # Get default recommendations + has_solar = len([r for r in recommendations[unit.id] if r["default"] and r["type"] == "solar_pv"]) > 0 + if has_solar: + break + + if has_solar: + # We adjust the units within the building + for unit in building: + for rec in recommendations[unit.id]: + if rec["type"] == "solar_pv": + # This is straightforward, we just set the default to True, since when we're at a building + # level, we only allow 1 solar PV option for each unit. 
If we change this, this logic will + # need to be updated + rec["default"] = True + + # ~~~~~~~~~~~~~~~~ + # Funding + # ~~~~~~~~~~~~~~~~ + + # for p in input_properties: + # funding_calulator = Funding( + # tenure=body.housing_type, + # starting_epc=p.data["current-energy-rating"], + # starting_sap=int(p.data["current-energy-efficiency"]), + # postcode=p.postcode, + # floor_area=p.floor_area, + # council_tax_band=None, # This is seemingly always None at the moment + # property_recommendations=recommendations[p.id], + # project_scores_matrix=eco_project_scores_matrix, + # whlg_eligible_postcodes=whlg_eligible_postcodes, + # gbis_abs_rate=15, + # eco4_abs_rate=15, + # ) + # funding_calulator.check_eligibiltiy() + # # Insert finding + # p.insert_funding(funding_calulator) logger.info("Uploading recommendations to the database") # If we have any work to do, we create a new scenario @@ -759,7 +944,11 @@ async def trigger_plan(body: PlanTriggerRequest): new_epc = sap_to_epc(new_sap_points) new_epc_bands[p.id] = new_epc - valuations = PropertyValuation.estimate(property_instance=p, target_epc=new_epc) + total_cost = sum([r["total"] for r in default_recommendations]) + + valuations = PropertyValuation.estimate( + property_instance=p, target_epc=new_epc, total_cost=total_cost + ) property_value_increase_ranges[p.id] = valuations if p.is_new: @@ -844,6 +1033,7 @@ async def trigger_plan(body: PlanTriggerRequest): # Commit final changes session.commit() + except IntegrityError: logger.error("Database integrity error occurred", exc_info=True) session.rollback() diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py index f84912fe..5db3d4d1 100644 --- a/backend/app/plan/schemas.py +++ b/backend/app/plan/schemas.py @@ -37,6 +37,7 @@ MEASURE_MAP = { VALID_GOALS = ["Increasing EPC"] VALID_HOUSING_TYPES = ["Social", "Private"] +VALID_EVENT_TYPES = ["remote_assessment"] # Define the validation function for inclusions/exclusions @@ -56,10 +57,16 @@ def 
check_housing_type(value: str) -> str: return value +def check_event_type(value: str) -> str: + assert value in VALID_EVENT_TYPES, f"{value} is not a valid event type" + return value + + # Use Annotated with BeforeValidator for each list item validation InclusionOrExclusionItem = Annotated[str, BeforeValidator(check_inclusion_or_exclusion)] Goal = Annotated[str, BeforeValidator(check_goals)] HousingType = Annotated[str, BeforeValidator(check_housing_type)] +EventType = Annotated[str, BeforeValidator(check_event_type)] class PlanTriggerRequest(BaseModel): @@ -75,8 +82,17 @@ class PlanTriggerRequest(BaseModel): valuation_file_path: Optional[str] = None exclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) inclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1) + # This is a list of measures that we want to be included, if they are options + # Default to empty + required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=[], min_length=1) scenario_name: Optional[str] = "" multi_plan: Optional[bool] = False optimise: Optional[bool] = True default_u_values: Optional[bool] = True + + ashp_cop: Optional[float] = 2.8 + + # When performing a remote assessment, if this has been set, it will allow the engine to + # pull data from the find my epc website, to utilise as part of a remote assessment + event_type: Optional[float] = "remote_assessment", diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py index 07d4642d..34fb02e7 100644 --- a/backend/app/plan/utils.py +++ b/backend/app/plan/utils.py @@ -1,9 +1,5 @@ -import pandas as pd -from backend.Property import Property from utils.s3 import read_from_s3 -from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value - from backend.app.config import get_settings import msgpack diff --git a/backend/ml_models/AnnualBillSavings.py b/backend/ml_models/AnnualBillSavings.py index 211e5ea6..b22837d8 
100644 --- a/backend/ml_models/AnnualBillSavings.py +++ b/backend/ml_models/AnnualBillSavings.py @@ -28,8 +28,8 @@ class AnnualBillSavings: # Latest price cap figures from Ofgem are for April 2024 # https://www.ofgem.gov.uk/energy-price-cap - ELECTRICITY_PRICE_CAP = 0.2236 - GAS_PRICE_CAP = 0.0548 + ELECTRICITY_PRICE_CAP = 0.2486 + GAS_PRICE_CAP = 0.0634 # This is the most recent export payment figure, at 9.28p/kWh # Smart export guarantee rates can be found here: # https://www.sunsave.energy/solar-panels-advice/exporting-to-the-grid/best-seg-rates @@ -39,8 +39,8 @@ class AnnualBillSavings: PRICE_FACTOR = 0.09549999999999999 # Daily standard charge, based on average across England, Scotland and Wales, and includes VAT - DAILY_STANDARD_CHARGE_GAS = 0.3143 - DAILY_STANDARD_CHARGE_ELECTRICITY = 0.601 + DAILY_STANDARD_CHARGE_GAS = 0.3165 + DAILY_STANDARD_CHARGE_ELECTRICITY = 0.6097 # Based on https://www.nottenergy.com/advice-and-tools/project-energy-cost-comparison # For July 2024. These quotes are based on the east midlands region, so we diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py index 92c55641..6d4852b2 100644 --- a/backend/ml_models/Valuation.py +++ b/backend/ml_models/Valuation.py @@ -1,5 +1,4 @@ import numpy as np -from scipy.constants import value class PropertyValuation: @@ -203,12 +202,43 @@ class PropertyValuation: return msm_increase, lloyds_increase @classmethod - def estimate(cls, property_instance, target_epc): + def estimate(cls, property_instance, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param property_instance: An instance of the Property class + :param target_epc: The target EPC rating + :param total_cost: The total cost of the retrofit + :return: + """ current_value = ( property_instance.valuation if property_instance.valuation else cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn) ) + current_epc = 
property_instance.data["current-energy-rating"] + + if not current_value: + return { + "current_value": 0, + "lower_bound_increased_value": 0, + "upper_bound_increased_value": 0, + "average_increased_value": 0, + "average_increase": 0 + } + + return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost) + + @classmethod + def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None): + """ + This function estimates the value of a property based on the current EPC rating and the target EPC rating + :param current_value: + :param current_epc: + :param target_epc: + :param total_cost: + :return: + """ + if not current_value: return { "current_value": 0, @@ -218,7 +248,6 @@ class PropertyValuation: "average_increase": 0 } - current_epc = property_instance.data["current-energy-rating"] # We get the spectrum of ratings between the current and target EPC epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1] @@ -242,6 +271,19 @@ class PropertyValuation: avg_increase = np.mean(all_increases) + if total_cost is not None: + # We CAP the retrofit ROI at 2 + avg_increase_value = current_value * avg_increase + if avg_increase_value / total_cost > 2: + # We re-scale the % so that the average value increase is no more than 2 times the total cost + double_cost = 2 * total_cost + new_avg_increase = double_cost / current_value + scalar = new_avg_increase / avg_increase + # We scale the min and max increases by the same scalar + min_increase *= scalar + max_increase *= scalar + avg_increase = new_avg_increase + return { "current_value": current_value, "lower_bound_increased_value": float(current_value * (1 + min_increase)), diff --git a/backend/ml_models/api.py b/backend/ml_models/api.py index c2f2dcd9..c108f1b7 100644 --- a/backend/ml_models/api.py +++ b/backend/ml_models/api.py @@ -39,6 +39,7 @@ class ModelApi: timestamp, prediction_buckets, 
base_url="https://api.dev.hestia.homes", + max_retries=2, ): """ This class handles the communication with the Model APIs. These models include SAP change, heat demain change @@ -54,6 +55,8 @@ class ModelApi: self.timestamp = timestamp self.prediction_buckets = prediction_buckets + self.max_retries = max_retries + @staticmethod def predictions_template(): return { @@ -295,15 +298,33 @@ class ModelApi: async def run_batches(): for chunk in tqdm(to_loop_over, total=len(to_loop_over)): - predictions_dict = await self.predict_all_async( - df=data.iloc[chunk:chunk + batch_size], - bucket=bucket, - model_prefixes=model_prefixes, - extract_ids=extract_ids - ) - for key, scored in predictions_dict.items(): - all_predictions[key] = pd.concat([all_predictions[key], scored]) + attempts = 0 + success = False + while attempts <= self.max_retries and not success: + try: + predictions_dict = await self.predict_all_async( + df=data.iloc[chunk:chunk + batch_size], + bucket=bucket, + model_prefixes=model_prefixes, + extract_ids=extract_ids + ) + + for key, scored in predictions_dict.items(): + all_predictions[key] = pd.concat([all_predictions[key], scored]) + + success = True + except Exception as e: + attempts += 1 + logger.error( + f"Batch {chunk}-{chunk + batch_size} failed (Attempt {attempts}/{self.max_retries}). " + f"Error: {e}" + ) + + if attempts > self.max_retries: + logger.error( + f"Skipping batch {chunk}-{chunk + batch_size} after {self.max_retries} failed attempts." 
class TestSearchEpcIntegration:
    """
    Integration tests for SearchEpc. `test_find_property` makes real API calls and reads the
    module-level EPC_AUTH_TOKEN, so it needs valid credentials and network access.
    """

    @pytest.mark.parametrize(
        # The argnames must match both the tuple arity below and the test signature;
        # the previous list named 5 arguments (ending in `expected_partial_address`) for 6-value cases.
        "address, postcode, uprn, skip_os, lmk_key, n_old_epcs",
        [
            # Test case 1: Valid address and postcode, skipping OS
            # In this case, the property is an individual flat but the uprn associated to the
            # EPC is for the building as a whole, possibly because there was a conversion of sorts
            ("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True,
             "260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1),

            # Test case 2: Another valid address and postcode
            # In this case, the newest EPC, does not have a uprn associated to it. If we did a search by
            # uprn, we would get an old EPC
            ("Flat 8, Hainton House", "DN32 9AQ", 10090082018, True,
             "bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 3),

        ],
    )
    def test_find_property(self, address, postcode, uprn, skip_os, lmk_key, n_old_epcs):
        """
        Integration test for `find_property`, making actual API calls.

        :param address: First address line passed to SearchEpc as `address1`
        :param postcode: Postcode of the property
        :param uprn: UPRN passed to the search, and expected on the newest EPC
        :param skip_os: Forwarded to `find_property(skip_os=...)`
        :param lmk_key: Expected `lmk-key` of the newest EPC
        :param n_old_epcs: Expected number of older EPCs found
        """
        # Provide your actual API keys or tokens here
        os_api_key = ""

        # Initialize the SearchEpc instance
        epc_searcher = SearchEpc(
            address1=address,
            postcode=postcode,
            uprn=uprn,
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=os_api_key,
        )

        # Execute the method
        epc_searcher.find_property(skip_os=skip_os)

        # We check that we have the correct epc
        assert epc_searcher.newest_epc["lmk-key"] == lmk_key
        assert epc_searcher.newest_epc["uprn"] == uprn
        assert len(epc_searcher.older_epcs) == n_old_epcs

    def test_search_housenumber(self):
        """Check that alphanumeric flat identifiers are extracted from a full address string."""
        eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
        res1 = SearchEpc.get_house_number(eg1, None)
        assert res1 == "A11"

        eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
        res2 = SearchEpc.get_house_number(eg2, None)
        assert res2 == "A9"
def api_call_decorator(func, max_retries=5):
    """
    Handles various aspects of the API call, including refreshing the access token if needed and handling pagination.

    The decorated method must return a tuple of (http_method, url, data). The wrapper performs the request(s)
    and returns the decoded JSON. When the response is paginated, i.e. `page_size` was passed as a keyword
    or the service returns an `@odata.nextLink`, every page is fetched and the items are merged
    into a single `{'value': [...]}` dictionary.

    :param func: The function to be decorated.
    :param max_retries: Maximum number of consecutive retries for a single page when the service answers with a
        retryable status (429/500/503). Once exceeded, a ValueError is raised instead of looping forever.
    :return: The wrapped function.
    :raises ValueError: On a non-retryable API error (raised by handle_error) or when retries are exhausted.
    """

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            # Check and refresh the access token if needed
            if self.is_access_token_expired():
                self.retrieve_access_token()
                logger.info("Access token refreshed.")

            # Get the HTTP method, URL, and optionally data from the function
            http_method, url, data = func(self, *args, **kwargs)

            results = []
            # An explicit page_size keyword always means "collect the pages". We additionally switch to
            # paginated mode as soon as a response carries a next link, so that callers relying on the
            # decorated method's default page size do not silently get only the first page.
            paginated = kwargs.get('page_size', None) is not None
            response_data = {}
            pages_fetched = 0
            retries = 0

            while url:
                logger.info("Making call for page: " + str(pages_fetched + 1))
                response = requests.request(http_method, url, headers=self.headers, json=data)

                if response.status_code != 200:
                    # handle_error sleeps and returns 'retry' for 429/500/503 and raises for everything else
                    outcome = handle_error(response)
                    retries += 1
                    if outcome != 'retry' or retries > max_retries:
                        raise ValueError(
                            f"API request failed with status code {response.status_code} "
                            f"after {retries} attempt(s)"
                        )
                    continue

                # Successful page: reset the retry budget for the next one
                retries = 0
                pages_fetched += 1
                response_json = response.json()
                next_link = response_json.get('@odata.nextLink', None)

                if paginated or next_link:
                    paginated = True
                    results.extend(response_json.get('value', []))
                    url = next_link
                    logger.info(f"Next page URL: {url}")
                else:
                    # Single, non-paginated resource: return the full response for consistency
                    response_data = response_json
                    break

            if paginated:
                response_data = {'value': results}

            return response_data

        except Exception:
            logger.exception("An error occurred during the API call.")
            # Bare raise keeps the original traceback intact
            raise

    return wrapper
+ :param access_token: The access token (optional) + :param access_token_expiration_details: The access token expiration details (optional) + """ + self.tenant_id = tenant_id + self.client_id = client_id + self.client_secret = client_secret + + if access_token: + if not access_token_expiration_details: + raise ValueError("Access token expiration details must be provided.") + self.access_token = access_token + self.set_access_token_expiration_details(access_token_expiration_details) + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + else: + self.retrieve_access_token() + + # Retrieve static identifiers + self.site_id = site_id + self.document_drive = self.get_documents_drive() + + def get_token_expiration_details(self): + """ + Returns the access token expiration details. Converts the datetime objects to strings for serialization. + :return: + """ + return { + 'access_token_request_timestamp': datetime.strftime( + self.access_token_request_timestamp, self.TIMESTAMP_FORMAT + ), + 'access_token_expiry': datetime.strftime(self.access_token_expiry, self.TIMESTAMP_FORMAT) + } + + def set_access_token_expiration_details(self, access_token_expiration_details): + """ + Sets the access token expiration details from a serialized dictionary. + :param access_token_expiration_details: The serialized access token expiration details. + :return: + """ + self.access_token_request_timestamp = datetime.strptime( + access_token_expiration_details['access_token_request_timestamp'], self.TIMESTAMP_FORMAT + ) + self.access_token_expiry = datetime.strptime( + access_token_expiration_details['access_token_expiry'], self.TIMESTAMP_FORMAT + ) + + def is_access_token_expired(self): + """ + Checks if the access token has expired. If it has, a new access token is retrieved. + :return: True if expired, False otherwise. 
+ """ + return datetime.now() >= self.access_token_expiry + + def retrieve_access_token(self, refresh=False): + """ + Implements authentication using MSAL. + :param refresh: If True, force a refresh of the access token. + :return: None + """ + app = ConfidentialClientApplication( + self.client_id, + authority=f"https://login.microsoftonline.com/{self.tenant_id}", + client_credential=self.client_secret + ) + + scope = ["https://graph.microsoft.com/.default"] + + access_token_request_timestamp = datetime.now() + + if refresh: + logger.info("Forcing refresh of access token.") + token = app.acquire_token_for_client(scopes=scope) + else: + # Check if a token is already cached + token = app.acquire_token_silent(scope, account=None) + + if not token: + token = app.acquire_token_for_client(scopes=scope) + + if "access_token" not in token: + logger.error("Authentication failed.") + raise ValueError("Authentication failed") + + access_token_expiry = access_token_request_timestamp + timedelta( + seconds=token['expires_in'] - 20 + ) + + self.access_token = token + self.access_token_request_timestamp = access_token_request_timestamp + self.access_token_expiry = access_token_expiry + self.headers = { + 'Authorization': f"Bearer {self.access_token['access_token']}" + } + + logger.info("Access token retrieved successfully.") + + @api_call_decorator + def get_documents_drive(self): + """ + Get the document drive of the SharePoint site. + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive" + logger.info(f"Getting document drive from URL: {url}") + return 'GET', url, None + + @api_call_decorator + def list_folder_contents(self, drive_id, folder_path: str, page_size: int = 100): + """ + This function will list the contents of a folder in SharePoint. + :param drive_id: The ID of the drive. + :param folder_path: The path of the folder. 
+ :param page_size: The number of items per page (default is 100). + :return: Tuple containing HTTP method, URL, and None for data. + """ + url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{folder_path}:/children?$top={page_size}" + logger.info(f"Listing folder contents from URL: {url}") + return 'GET', url, None + + @staticmethod + def download_sharepoint_file(download_url): + """ + Downloads a file from the given URL and returns its content. + + :param download_url: The URL to download the file from. + :return: The content of the downloaded file. + """ + response = requests.get(download_url, stream=True) + response.raise_for_status() # Check if the request was successful + + file_content = BytesIO() + + # Read the file content into memory + for chunk in response.iter_content(chunk_size=8192): + file_content.write(chunk) + + file_content.seek(0) # Reset the file pointer to the beginning + + return file_content + + def download_sharepoint_folder(self, drive_id, folder_path, download_dir, excluded_file_types=None): + """ + Downloads all files in a SharePoint folder to the specified local directory. + + :param drive_id: The ID of the SharePoint drive. + :param folder_path: The path of the folder in SharePoint. + :param download_dir: The local directory to save the downloaded files. + :param excluded_file_types: A list of file types to exclude from download (default is None). 
def app():
    """
    Build survey access-rate tables from the customers' "Survey Outcomes" workbooks on SharePoint.

    SharePoint credentials and the site id are read from the SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET,
    SHAREPOINT_TENANT_ID and WARMFRONT_SHAREPOINT_SITE_ID environment variables. Only customers whose location
    points at a single .xlsx file are processed; folder locations and customers with no location are skipped.

    NOTE(review): the per-surveyor and weekly tables are computed but currently neither returned nor
    persisted - confirm what the intended output is.

    :raises ValueError: If a configured workbook cannot be found in its SharePoint folder.
    """
    # Customers for WC 18/11/2024
    #
    # ----- Eastlight location -----
    # No data this week, low on data
    # Housing Associations/Eastlight/Survey Outcomes/
    #
    # ----- Settle location -----
    # No data this week, in separate files
    # Housing Associations/Settle/Survey Outcomes/
    #
    # ----- Community Housing -----
    # In separate files - will we get to a singular form?
    # Housing Associations/Community Housing/Survey Outcomes/
    #
    # ----- ACIS location -----
    # Doesn't have this week's data
    # Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx
    #
    # ----- Southern location -----
    #
    #
    # ------ Unitas location ------
    # Does have this week's data
    # Unitas location: Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx

    locations = {
        "Unitas": "Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx",
        "Eastlight": "Housing Associations/Eastlight/Survey Outcomes/",
        "Settle": "Housing Associations/Settle/Survey Outcomes/",
        "Community Housing": "Housing Associations/Community Housing/Survey Outcomes/",
        "ACIS": "Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx",
        "Southern": None,
    }

    SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
    SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
    SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
    WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None)

    sharepoint_client = SharePointClient(
        tenant_id=SHAREPOINT_TENANT_ID,
        client_id=SHAREPOINT_CLIENT_ID,
        client_secret=SHAREPOINT_CLIENT_SECRET,
        site_id=WARMFRONT_SHAREPOINT_SITE_ID
    )

    for customer, location in locations.items():
        if location is None:
            continue

        if location.endswith(".xlsx"):
            # Read in the file
            # List the contents of the folder
            location_folder = os.path.dirname(location)
            contents = sharepoint_client.list_folder_contents(
                drive_id=sharepoint_client.document_drive["id"],
                folder_path=location_folder
            )
            filepaths = contents["value"]

            download_url = next(
                (file['@microsoft.graph.downloadUrl'] for file in filepaths
                 if '@microsoft.graph.downloadUrl' in file and file['name'] == os.path.basename(location)),
                None
            )

            if download_url is None:
                raise ValueError("File not found in the SharePoint folder.")

            file_content = sharepoint_client.download_sharepoint_file(download_url)

            # Convert to pandas dataframe since file is an excel file
            df = pd.read_excel(file_content)
            df["Outcome"] = df["Outcome"].str.strip().str.lower()

            # We cannot group by funding type accurately because any job that is not funded will have a NaN value
            # and therefore we have a 100% access rate for funded jobs and 0% otherwise
            surveyor_outcomes = []
            # Grouping on two keys yields two-element group keys; the funding label is derived per group below
            for (week, surveyor), group in df.groupby(["Week Commencing", "DEA/REA"]):
                funding_type = [x for x in group["Funding Type"].unique() if not pd.isnull(x)]
                if funding_type:
                    funding_type = " + ".join(funding_type)
                else:
                    funding_type = "No Funding"
                surveyed = group[group["Outcome"] == "surveyed"]
                no_answer = group[
                    group["Outcome"] == "no answer"
                ]
                other_issue = group[~group["Outcome"].isin(["surveyed", "no answer"])]

                surveyor_outcomes.append(
                    {
                        "Surveyor": surveyor,
                        "Week": week,
                        "Funding": funding_type,
                        "Surveyed": surveyed.shape[0],
                        "No Answer": no_answer.shape[0],
                        "Other Issue": other_issue.shape[0],
                    }
                )

            surveyor_outcomes = pd.DataFrame(surveyor_outcomes)
            surveyor_outcomes["Week"] = pd.to_datetime(surveyor_outcomes["Week"])

            weekly_access = (
                surveyor_outcomes.drop(columns=["Surveyor"]).groupby(["Week", "Funding"]).sum().reset_index()
            )
            # Sort by week and surveyor ascending
            surveyor_outcomes = surveyor_outcomes.sort_values(["Week", "Surveyor"], ascending=[True, True])
            surveyor_outcomes["Access Rate"] = 100 * surveyor_outcomes["Surveyed"] / (
                surveyor_outcomes["Surveyed"] + surveyor_outcomes["No Answer"] + surveyor_outcomes["Other Issue"]
            )

            weekly_access["Total"] = (
                weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"]
            )
            weekly_access["Access Rate"] = 100 * weekly_access["Surveyed"] / weekly_access["Total"]
a/etl/access_reporting/requirements.txt b/etl/access_reporting/requirements.txt new file mode 100644 index 00000000..8e6dbb08 --- /dev/null +++ b/etl/access_reporting/requirements.txt @@ -0,0 +1,11 @@ +python-docx==0.8.11 +PyPDF2==3.0.1 +boto3 +requests +pandas +pyarrow==12.0.1 +openpyxl==3.1.2 +usaddress==0.5.10 +pdfplumber==0.10.3 +msgpack==1.0.5 +msal \ No newline at end of file diff --git a/etl/costs/app.py b/etl/costs/app.py index 797191d2..f2bf365b 100644 --- a/etl/costs/app.py +++ b/etl/costs/app.py @@ -11,7 +11,7 @@ import inspect src_file_path = inspect.getfile(lambda: None) -DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20240917 Hestia Materials.xlsx" +DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20250316 Domna Materials.xlsx" # Environment file is at the same level as this file ENV_FILE = Path(src_file_path).parent / "etl" / "costs" / ".env" dotenv.load_dotenv(ENV_FILE) @@ -91,6 +91,7 @@ def app(): lel_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="low_energy_lighting", header=0) flat_roof_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="flat_roof_insulation", header=0) window_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="window_glazing", header=0) + rir_insulation_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="room_roof_insulation", header=0) # Form a single table to be uploaded costs = pd.concat( @@ -104,7 +105,8 @@ def app(): ewi_costs, lel_costs, flat_roof_costs, - window_costs + window_costs, + rir_insulation_costs, ] ) diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py new file mode 100644 index 00000000..b371e2e5 --- /dev/null +++ b/etl/customers/aiha/bid_numbers.py @@ -0,0 +1,106 @@ +""" +This is an adhoc script, used to pull together some of the figures that are being included in the +Warm Homes: Social Housing Wave 3 funding application +""" + +import pandas as pd +import numpy as np + +aiha_all_units = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/AIHA 
Measures Packages 2024_11_13.xlsx", + sheet_name="All Properties - AIHA", + header=2 +) +modelled_units = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx", + sheet_name="Modelled Properties - Measures", + header=5 +) +aiha_all_units = aiha_all_units.drop(columns=['Unnamed: 0', 'Unnamed: 1']) +aiha_extracted_property_data = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv" +) +aiha_wave_3_units = aiha_all_units[aiha_all_units["Expected Package Cost"].astype(float) > 0] +# TODO: The EPC C property isn't a C! +aiha_epc_breakdown = aiha_wave_3_units["Expected EPC Rating"].replace({"D or E": "E"}).value_counts() +# For CAHA +caha_epc_breakdown = modelled_units[ + modelled_units['Survey Key'].str.contains("CAHA") +]['Current EPC Rating'].value_counts() +# For Hornsey +hornsey_epc_breakdown = modelled_units[ + modelled_units['Survey Key'].str.contains("HORNSEY") +]['Current EPC Rating'].value_counts() + +aiha_original_asset_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/240924- KSQ & Domna Info Merge - AIHA - SHDF Wave 3 " + "bid - Supplementary information.xlsx", + sheet_name="Archetyping Data", + header=2 +) + +# Get the units in the bid: +aiha_wave_3_features = aiha_original_asset_data[ + ['Address letter or number', 'Street address', 'Postcode', "Wall type", + "Property type", "built-form", "floor"] +].merge( + aiha_wave_3_units[['Address letter or number', 'Street address', 'Postcode']], + how="inner", + on=["Address letter or number", "Street address", "Postcode"] +) + +wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts() +property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index() + +aiha_wave_3_features[aiha_wave_3_features["Property type"] == "Flat"][["Street address", "Postcode"]] + +# 4 Yetev Lev Court  ... Semi-Detached mid - Medium +# B 86 Bethune Road ... 
Mid-Terrace top. - Low +# A 80 Bethune Road ... Mid-Terrace ground. - Low +# B 80 Bethune Road ... \n \n - Low +# A 9 Clapton Common ... Semi-Detached ground. - Low +# C 9 Clapton Common ... End-Terrace \n. - Low +# B 89 Manor Road ... \n \n. - Low +# A 6 Northfield Road ... Detached top. - Low +# 13 Northfield Rd ... Semi-Detached \n - Low +# A 73 Manor Road ... End-Terrace \n - Low +# B 73 Manor Road ... Detached top - Low + +# Hornsey data - contained in original asset list +hornsey_asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing " + "Trust.xlsx", + sheet_name="Ksquared-All units information", + header=3 +) + +# We don't need the first row +hornsey_asset_list = hornsey_asset_list.iloc[1:] +# Fill NA values with empty strings +hornsey_asset_list = hornsey_asset_list.fillna("") +hornsey_asset_list["Address letter or number"] = hornsey_asset_list["Address letter or number"].astype( + str +).str.strip() +hornsey_asset_list["Postcode"] = hornsey_asset_list["Postcode"].astype(str).str.strip() +hornsey_asset_list["Street address"] = hornsey_asset_list["Street address"].astype(str).str.strip() +# Replace double spaces +for col in ["Address letter or number", "Street address", "Postcode"]: + hornsey_asset_list[col] = hornsey_asset_list[col].str.replace(" ", " ") + +hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""] + +hornsey_asset_list["Wall Type Cleaned"] = np.where( + hornsey_asset_list["Wall type"].str.contains("Cavity"), + "Cavity", + "Solid" +) + +hornsey_asset_list["Property type"].value_counts() + +# CAHA +caha_epc_data = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx" +) + +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["property_type"].value_counts() +caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["wall_type"].value_counts() diff --git 
a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py new file mode 100644 index 00000000..44baef80 --- /dev/null +++ b/etl/customers/aiha/xml_extraction.py @@ -0,0 +1,988 @@ +import os +from io import BytesIO + +import pandas as pd + +from etl.xml_survey_extraction.XmlParser import XmlParser + +SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS" +CONTINGENCY_RATE = 0.26 + + +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. + :param sap_points: numerical value of SAP points, typically between 0 and 100 + :return: + """ + + if sap_points <= 0: + raise ValueError("SAP points should be above 0.") + + if sap_points >= 92: + return "A" + elif sap_points >= 81: + return "B" + elif sap_points >= 69: + return "C" + elif sap_points >= 55: + return "D" + elif sap_points >= 39: + return "E" + elif sap_points >= 21: + return "F" + else: + return "G" + + +def main(): + """ + This script handles the extraction of data from the XML files in the survey folders. + :return: + """ + # Step 1: List all subfolders inside SURVEY_FOLDER_PATH. + subfolders = [f.path for f in os.scandir(SURVEY_FOLDER_PATH) if f.is_dir()] + + # Step 2: Loop through each subfolder and find the XML files. + extracted_surveys = [] + for subfolder in subfolders: + print(f"Searching in subfolder: {subfolder}") + + # Find all XML files in the current subfolder. + xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')] + + if not xml_files: + print(f"No XML files found in subfolder: {subfolder}") + continue + + # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key. + for xml_file in xml_files: + xml_path = os.path.join(subfolder, xml_file) + print(f"Processing XML file: {xml_path}") + + # Read in the XML and parse it using the XmlParser class. 
+ with open(xml_path, 'rb') as file: + xml_data_io = BytesIO(file.read()) + uprn = None # Set the UPRN if available. + + # Create an XmlParser instance + xml_parser = XmlParser( + file=xml_data_io, + filekey=xml_path, + surveyor_company="", + uprn=uprn, + ) + + # Run the parser to extract the data + xml_parser.run() + if not xml_parser.epc: + # If we don't have a lig xml + continue + + # Store the extracted data for further processing + extracted_surveys.append({ + "survey_key": subfolder.split("/")[-1], + **xml_parser.epc, + **xml_parser.additional_data + }) + + print(f"Extracted {len(extracted_surveys)} surveys.") + # Process the extracted_surveys as needed, for example, save to a database or write to a file. + extracted_surveys = pd.DataFrame(extracted_surveys) + + # THis is the data we need for the AIHA project + measures_data = extracted_surveys[ + ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating", + "number_of_floors", "walls-description", "property-type", "built-form"] + ] + measures_data = measures_data.sort_values("survey_key", ascending=True) + measures_data.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv", + ) + + # Note: + # The properties will still have "Very poor" ratings for their hot water + + # TODO + # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft + # [Can't remember, not clear - Chenai will check] + # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same + # buulding [Question for Lewis & Kevin] + # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to accessed from + # the other unit] + # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units? + # [Question for Lewis & Kevin] - [YES - ASHP!!!!] 
+ + # TODO: Check which properties are in a conservation area + # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR) + # TODO: Adjust Archetype 14 homes to exclude double glazing? Or should we exclude entirely + + recommended_measures = [ + { + "survey_key": "AIH001-01", + "starting_sap": 69, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-02", + "starting_sap": 65, + "recommended_measures": [ + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 72, + "notes": "The array can be mounted on the flat roof, so that panels are south facing" + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 4, + "ending_sap": 76 + } + ], + }, + { + "survey_key": "AIH001-03", + "starting_sap": 43, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 44, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "4kWp", + "orientation": "East", + "elavation": 30, + "overshading": "None or little", + }, + ], + "sap_points": 10, + "ending_sap": 54 + }, + { + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", + "sap_points": 20, + "ending_sap": 74 + }, + { + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 15, + "ending_sap": 89 + } + ], + "notes": "Unclear if the loft is accessible" + }, + { + "survey_key": "AIH001-04", + "starting_sap": 48, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "floor_area": 39.1482, # based on area of top floor + "sap_points": 4, + "ending_sap": 52 + }, + { + "measure": "TTZC", 
+ "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 55 + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "4kWp", + "orientation": "South", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 15, + "ending_sap": 70 + } + ], + "notes": "Roof is flat, PV array should be installed south facing with elevation" + }, + { + "survey_key": "AIH001-05", + "starting_sap": 54, + "recommended_measures": [ + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "floor_area": 49.48, # based on area of top floor + "sap_points": 5, + "ending_sap": 59, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 9, + "ending_sap": 70 + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 73 + } + ], + "notes": "" + }, + { + "survey_key": "AIH001-06", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "2kWp Solar PV system", + "config": [ + { + "size": "2kW", + "orientation": "South", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 6, + "ending_sap": 70 + } + ] + }, + { + "survey_key": "AIH001-07", + "starting_sap": 74, + "recommended_measures": [], + "notes": "Is EPC C" + }, + { + "survey_key": "AIH001-08", + "starting_sap": 56, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 54.2864, # Based on area of top floor + "sap_points": 2, + "ending_sap": 58, + }, + { + 
"measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 4, + "ending_sap": 62, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": 24.13 * 2.63, + "sap_points": 7, + "ending_sap": 69, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-09", + "starting_sap": 44, + "recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension + "sap_points": 1, + "ending_sap": 53, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" + } + ], + "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array" + "per unit). Area on south-east part of roof is ~22m2 with no overshadowing. Flat roof area is 8m2" + "with modest overshadowing. 
We suggest a 3.2kWp system, across two units" + }, + { + "survey_key": "AIH001-11", + "starting_sap": 59, + "recommended_measures": [ + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 4, + "ending_sap": 63, + }, + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": (18.50 * 3.12) + (19.00 * 2.75), + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 69, + } + ] + }, + { + "survey_key": "AIH001-12", + "starting_sap": 46, + "recommended_measures": [ + { + "measure": "Double Glazing", + "description": "Installation of double glazing", + "n_windows": 20, # Counted the bay windows each as 3 + "windows_area": 10.66, + "sap_points": 3, + "ending_sap": 49, + }, + # { + # "measure": "Solar PV", + # "description": "3.2kWp Solar PV system", + # "config": [ + # { + # "size": "3.2W", + # "orientation": "East", + # "elavation": 30, + # "overshading": "Little or none", + # } + # ], + # "sap_points": 9, + # "ending_sap": 58 + # }, + { + "measure": "Air Source Heat Pump", + "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)", + "sap_points": 15, + "ending_sap": 65 + }, + { + "measure": "Tariff Review", + "description": "Switch to 24-hour tariff", + "sap_points": 15, + "ending_sap": 80 + } + ] + }, + { + "survey_key": "AIH001-13", + "starting_sap": 53, + "recommended_measures": [ + { + "measure": "Roof Insulation", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 39.75, # based on the floor area of the RIR + "sap_points": 6, + "ending_sap": 59, + }, + { + "measure": "Flat Roof Insulation", + "description": "100mm flat roof insulation", + "floor_area": 33.06, # Based on area of the extension + "sap_points": 2, + "ending_sap": 61, + }, + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default 
standard", + "hlp": (35.40 * 2.65) + (26.70 * 2.73) + (16.30 * 2.71), # 1st & 2nd extension + "sap_points": 6, + "ending_sap": 67, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 67, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 2, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "4kW", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 78 + } + ] + }, + { + "survey_key": "AIH001-14", + "starting_sap": 63, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (11.00 * 2.6) + (11.00 * 2.65) + (4.60 * 2.7), + "sap_points": 5, + "ending_sap": 68, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 68, + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", # Based on area of main building + "floor_area": 59.20, + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "sap_points": 10, + "ending_sap": 79, + } + ] + }, + { + "survey_key": "AIH001-15", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 73.81, # Based on area of main building + "sap_points": 1, + "ending_sap": 61, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 64, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 71, + "notes": "The array is North-west facing and therefore will be slightly less efficient than south" + 
"facing, however the impact is not so severe as to make the installation not worthwhile." + "Ground mounted" + } + ] + }, + { + "survey_key": "AIH001-16", + "starting_sap": 60, + "recommended_measures": [ + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (21.56 * 2.60) + (26.79 * 2.8) + (6.74 * 2.60), + "sap_points": 4, + "ending_sap": 64, + }, + { + "measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 64, + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 20.92, # Based on floor area of RIR + "sap_points": 1, + "ending_sap": 65, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "2.4W", + "orientation": "South-East", + "elavation": 30, + "overshading": "Modest", + } + ], + "sap_points": 5, + "ending_sap": 70, + } + ] + }, + { + "survey_key": "AIH001-17", + "starting_sap": 62, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 63, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 66, + }, + { + "measure": "Solar PV", + "description": "4kWp Solar PV system", + "config": [ + { + "size": "3.2kW", + "orientation": "East", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kW", + "orientation": "West", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 12, + "ending_sap": 78, + } + ] + }, + { + "survey_key": "AIH001-18", + "starting_sap": 58, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 37.52, # Based on area of main building and 1st extension + "sap_points": 7, + "ending_sap": 65, + }, + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 1, + "ending_sap": 
66, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 2, + "ending_sap": 68, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 7, + "ending_sap": 75, + } + ], + + }, + { + "survey_key": "AIH001-19", + "starting_sap": 76, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-20", + "starting_sap": 82, + "recommended_measures": [] + }, + { + "survey_key": "AIH001-21", + "starting_sap": 53, + "recommended_measures": [ + { + "measure": "Cylinder Insulation", + "description": "80mm cylinder insulation", + "sap_points": 2, + "ending_sap": 55, + }, + { + "measure": "Roof Insulation", + "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)", + "floor_area": 22.80, # Based on floor area of RIR + "sap_points": 7, + "ending_sap": 62, + }, + { + "measure": "Solar PV", + "description": "2.4kWp Solar PV system", + "config": [ + { + "size": "1.6kWp", + "orientation": "Horizontal", + "elavation": 30, + "overshading": "None or little", + }, + { + "size": "0.8kWp", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 9, + "ending_sap": 71, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 74, + } + ] + }, + { + "survey_key": "AIH001-SIMULATED-01", + "elmhurst_reference": "000020", + "starting_sap": None, + "recommended_measures": [ + { + "measure": "Internal Wall Insulation", + "description": "100mm internal wall insulation", + "hlp": (22.35 * 3.24) + (22.13 * 2.53), + "sap_points": 8, + "ending_sap": 52, + }, + { + "measure": "Cavity Wall Insulation", + "description": "CWI to rdSAP default standard", + "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension + "sap_points": 1, + "ending_sap": 53, + }, + { + 
"measure": "Ventilation", + "description": "2x DMEV fans", + "sap_points": 0, + "ending_sap": 53, + }, + { + "measure": "TTZC", + "description": "Smart Thermostat", + "sap_points": 3, + "ending_sap": 56, + }, + { + "measure": "Solar PV", + "description": "1.6kWp Solar PV system", + "config": [ + { + "size": "1.6W", + "orientation": "South-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 6, + "ending_sap": 62 + }, + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension + "sap_points": 8, + "ending_sap": 70, + "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, " + "which is also owned by AIHA" + } + ], + "notes": "This was cloned from 80A. There is no existing data for 80B" + }, + { + "survey_key": "AIH001-SIMULATED-05", + "starting_sap": 68, + "recommended_measures": [ + { + "measure": "Loft Insulation", + "description": "300mm loft insulation", + "floor_area": 42.5, + "sap_points": 1, + "ending_sap": 69, + }, + { + "measure": "Solar PV", + "description": "3.2kWp Solar PV system", + "config": [ + { + "size": "3.2W", + "orientation": "North-East", + "elavation": 30, + "overshading": "None or little", + } + ], + "sap_points": 8, + "ending_sap": 77, + } + ] + } + ] + + scaffolding_data = [ + { + "number_of_floors": 2, + "price": 841, + }, + { + "number_of_floors": 3, + "price": 1077, + } + ] + + # TODO - Need an update cost for cylinder insulation + pricing_data = [ + {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'}, + {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'}, + {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'}, + {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'}, + {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'}, + {'item': 
'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None}, + {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'}, + {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)', 'unit_price': 21189 + 1200, + 'unit': 'unit'}, + {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80, + 'unit': 'floor_m2'}, + {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'}, + {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'}, + {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'}, + {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'}, + {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'}, + {'item': '2.4kWp Solar PV system', 'unit_price': 3363, 'unit': 'unit_needs_scaffolding'}, + {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'}, + {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'}, + {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'}, + ] + pricing_data = pd.DataFrame(pricing_data) + + for recommendation in recommended_measures: + property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze() + total_cost = 0 + + for measure in recommendation["recommended_measures"]: + measure_pricing = pricing_data[pricing_data["item"] == measure["description"]] + measure_unit = measure_pricing["unit"].values[0] + + if measure_unit in ["unit", None]: + measure_cost = float(measure_pricing["unit_price"].values[0]) + elif measure_unit == "unit_needs_scaffolding": + n_floors = property_data["number_of_floors"] + scaffolding_cost = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"] + measure_cost = float(measure_pricing["unit_price"].values[0]) + scaffolding_cost + elif measure_unit == "floor_m2": + measure_cost = 
float(measure_pricing["unit_price"].values[0]) * measure["floor_area"] + elif measure_unit == "hlp_m2": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["hlp"] + elif measure_unit == "window": + measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["n_windows"] + else: + raise Exception("Unknown unit type") + + measure["Total Cost"] = measure_cost + total_cost += measure_cost + + recommendation["total_cost"] = total_cost + + # Step 1: Normalize the recommended_measures data into a DataFrame. + normalized_measures = [] + for survey in recommended_measures: + survey_key = survey["survey_key"] + starting_sap = survey["starting_sap"] + total_cost = survey.get("total_cost", 0) + + for measure in survey.get("recommended_measures", []): + # Include hlp and floor_area for each measure if available + hlp = measure.get("hlp", None) + floor_area = measure.get("floor_area", None) + + normalized_measures.append({ + "survey_key": survey_key, + "hlp": hlp, + "floor_area": floor_area, + "starting_sap": starting_sap, + "measure": measure["measure"], + "description": measure.get("description", ""), + "sap_points": measure.get("sap_points", 0), + "measure_cost": measure.get("Total Cost", 0), + "total_cost": total_cost + }) + + # Convert the normalized list into a DataFrame. + measures_df = pd.DataFrame(normalized_measures) + + # Step 2: Pivot the measures_df to have a column for each measure type, using the description as values. + pivoted_measures = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="description", + aggfunc=lambda x: ' '.join(x), # Concatenate descriptions if there are multiple entries. 
+ fill_value=None + ).reset_index() + + measures_columns = [x for x in pivoted_measures.columns if x not in ["survey_key"]] + # We add a "Cost of" column for each measure + for measure in measures_columns: + pivoted_measures[f"Cost of {measure}"] = None + + pivoted_floor_area = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="floor_area", + aggfunc="first" # Use 'first' since each measure should only appear once per survey_key + ).add_prefix("floor_area - ").reset_index() + + pivoted_hlp = measures_df.pivot_table( + index="survey_key", + columns="measure", + values="hlp", + aggfunc="first" + ).add_prefix("hlp - ").reset_index() + + # Merge hlp and floor_area data + pivoted_measures = pivoted_measures.merge(pivoted_hlp, on="survey_key", how="left") + pivoted_measures = pivoted_measures.merge(pivoted_floor_area, on="survey_key", how="left") + + # Step 3: Calculate the total sap points and total cost for each survey. + totals = measures_df.groupby("survey_key").agg( + total_sap_points=("sap_points", "sum"), + ).reset_index() + + # Merge total sap points into the pivoted measures. + pivoted_measures = pd.merge(pivoted_measures, totals, on="survey_key", how="left") + # pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE + # pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"] + + # Step 4: Extract starting SAP for each survey key. + starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]] + + # Merge starting SAP back onto pivoted measures. + result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left") + + # Step 5: Calculate the ending SAP. 
+ result_df["Ending SAP"] = result_df["starting_sap"] + result_df["total_sap_points"] + result_df["Ending EPC Rating"] = result_df["Ending SAP"].apply(sap_to_epc) + + # Step 6: Merge the result with the measures_data to get the final DataFrame. + final_measures = measures_data.merge( + result_df, how="left", on="survey_key" + ) + + final_measures.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Measures packages.csv") + + # Store costs + pricing_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Pricing data.csv") + +# if __name__ == "__main__": +# main() diff --git a/etl/customers/benyon/epc_data.py b/etl/customers/benyon/epc_data.py new file mode 100644 index 00000000..9ba71f2f --- /dev/null +++ b/etl/customers/benyon/epc_data.py @@ -0,0 +1,71 @@ +""" +Rough script to get the EPC data for Benyon +""" + +import pandas as pd +import os +from dotenv import load_dotenv +from backend.SearchEpc import SearchEpc +from asset_list.utils import get_data + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + +asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/List of All Properties ecl Grd Rents in " + "Alphabetical Order.xlsx", + header=1 +) +asset_list.columns = ["tennancy", "landlord_id", "landlord_address"] +# Get postcode as the last 2 parts of the address, split on space +asset_list["postcode"] = asset_list["landlord_address"].apply(lambda x: x.split(" ")[-2] + " " + x.split(" ")[-1]) + +asset_list["house_no"] = asset_list.apply( + lambda x: SearchEpc.get_house_number(address=x["landlord_address"], postcode=x["postcode"]), axis=1 +) + +epc_data, errors, no_epc = get_data( + df=asset_list, + manual_uprn_map={}, + epc_auth_token=EPC_AUTH_TOKEN, + uprn_column=None, + fulladdress_column="landlord_address", + address1_column="house_no", + postcode_column="postcode", + property_type_column=None, + built_form_column=None, + epc_api_only=True, + 
row_id_name="landlord_id", +) + +df = asset_list[asset_list["landlord_id"].isin(no_epc)] +epc_df = pd.DataFrame(epc_data) +epc_df["current-energy-rating"].value_counts() +epc_df["property-type"].value_counts() +epc_df["walls-description"].value_counts(normalize=True) + +asset_list = asset_list.merge( + epc_df[ + [ + "landlord_id", "current-energy-rating", "property-type", "total-floor-area", "roof-description", + "walls-description", "co2-emissions-current" + ] + ], + how="left", + left_on="landlord_id", + right_on="landlord_id" +) +asset_list.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list.csv", index=False +) + +asset_list_big = asset_list.merge( + epc_df, + how="left", + left_on="landlord_id", + right_on="landlord_id" +) +asset_list_big.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list_full_data.csv", + index=False +) diff --git a/etl/customers/bromford/data_cleanup.py b/etl/customers/bromford/data_cleanup.py new file mode 100644 index 00000000..45429523 --- /dev/null +++ b/etl/customers/bromford/data_cleanup.py @@ -0,0 +1,192 @@ +""" +12th April 2025 +This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a +standardised asset list +""" + +import pandas as pd + +# Step 1 +# The inspectons data is spread across three different files. 
We attempt to produce one finalised asset list, with +# comprehensive inspections + +# Primary asset list +asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset " + "List.xlsx", + sheet_name="Asset List" +) + +# +inspections_1 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "MDS.xlsx", + sheet_name="Data list" +) +inspections_1["Heating Type"] = (inspections_1["Heating Type"] + " " + inspections_1["Heating fuel"]).str.strip() + +inspections_2 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "MERLIN LANE.xlsx", + sheet_name="Report" +) +inspections_2["AssetTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[-1] +inspections_2["PropTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[:-1].str.join(" ") + +inspections_3 = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD " + "SEVERN VALE - KLARKE.xlsx", + sheet_name="Asset report" +) + +inspections_3["FullAddress"] = inspections_3["T1_Address1"] + ", " + inspections_3["T1_Address2"] + +# On inspections 3, we have multiple sheets which describe the heating +heating_systems = [] +for sheet_name in [ + "Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating", + "Gas Central Heating", "Electric Boiler", "Oil Fired Central Heating", + "Communal Boilers", "Panel Heaters" +]: + df = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme " + "Rebuild/Inspections/BROMFORD " + "SEVERN VALE - KLARKE.xlsx", + sheet_name=sheet_name + ) + df = df[["UPRN"]] + df["Heating Type"] = sheet_name + heating_systems.append(df) + +heating_systems = pd.concat(heating_systems) +# We have no clue which 
one is correct, we have some dupes +heating_systems = heating_systems.drop_duplicates("UPRN") +heating_systems = heating_systems.rename(columns={"UPRN": "Asset"}) +heating_systems["Asset"] = heating_systems["Asset"].astype(int) + +inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset") + +# Create a consolidated inspections sheet +inspections = pd.concat( + [ + inspections_1[["Asset", "Construction type", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + inspections_3[["Asset", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]], + ] +) + +inspections_address_data = pd.concat( + [ + inspections_1[ + ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", 'ManAreaDesc', ] + ], + inspections_2[ + ['Asset', 'FullAddress', 'AccomType', "AssetTypeDesc", "PropTypeDesc", 'ConYear', 'Postcode'] + ].rename(columns={"Postcode": "PostCode"}), + inspections_3[ + ['Asset', "FullAddress", 'T1_Postcode', 'T1_Build Year', 'T1_AssetType'] + ].rename( + columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"} + ), + ] +) + +# Remove some error values +inspections = inspections[~inspections["Asset"].isin( + [ + "They're all green partial fill they're all green this", + "South Staffordshire District Council", + 'Blk Milton Crt F9-10, Perton, Wolverhampton' + ] +)] + +inspections["Asset"] = inspections["Asset"].astype(str) +asset_list["Asset"] = asset_list["Asset"].astype(str) +inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str) +inspections['WFT Findings'] = inspections['WFT Findings'].replace(r'^\s*$', pd.NA, regex=True) + +# We have some cases where the inspetions data has dupes on Asset (the ID column). 
We take the instance that is +# populated +inspections = inspections.sort_values(by='WFT Findings', na_position='last') +inspections = inspections.drop_duplicates(subset='Asset', keep='first') + +# We have dupes in the asset list +asset_list = asset_list.drop_duplicates("Asset") + +# Merge on +missed_asset_ids = inspections[ + ~inspections["Asset"].isin(asset_list["Asset"].values) +]["Asset"].values + +missed_assets = inspections_address_data[ + inspections_address_data["Asset"].isin(missed_asset_ids) +] +missed_assets = missed_assets.drop_duplicates("Asset") + +# We produce a larger asset list +asset_list = pd.concat([asset_list, missed_assets]) + +asset_list = asset_list.merge( + inspections, how="left", on="Asset" +) +asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note") + +# Store +# asset_list.to_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared " +# "data/asset_list.xlsx" +# ) + +# We now prepare outcomes into a single file +pv_outcomes = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV " + "Outcomes.csv", + encoding='cp1252' +) +pv_outcomes["measure_type"] = "solar" + +other_outcomes = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) " + "15.04.2024.xlsx", + sheet_name="ECO4 & GBIS", + header=1 +) +other_outcomes["measure_type"] = "cwi" + +combined_outcomes = pd.concat( + [ + other_outcomes[["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES"]].rename( + columns={ + "NO": "No", "ADDRESS": "Address", "POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing", + "OUTCOMES": "Outcome", "NOTES": "Notes" + } + ), + pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes"]] + ] +) + +# Store +# combined_outcomes.to_excel( +# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 
def app():
    """
    Build and upload the remote-assessment inputs for the Cambridge pilot.

    For each hard-coded property the EPC register is searched; when a
    certificate exists, the Find My EPC data for the registered address is
    retrieved. The asset list, the non-invasive recommendations and the
    valuations are written as CSVs to the plan-inputs bucket, and the request
    body that references those files is printed.
    """
    bucket = "retrofit-plan-inputs-dev"

    def _store(frame, key):
        # Upload one CSV and hand the key back so it can go into the body
        save_csv_to_s3(
            dataframe=frame,
            bucket_name=bucket,
            file_name=key
        )
        return key

    properties = pd.DataFrame(
        [
            {
                "address": "12 Church Lane", "postcode": "CB23 8AF", "uprn": 100090136018,
                "property_type": "House", "built-form": "Semi-Detached"
            },
            {"address": "21 High Street", "postcode": "CB23 8AB", "uprn": 100090144815},
            {"address": "22 High Street", "postcode": "CB23 8AB", "uprn": 100090144816},
            {"address": "5 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078615},
            {"address": "6 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078616},
            {"address": "7 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078617},
            {"address": "32 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200075},
            {"address": "33 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200076},
            {"address": "35 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200078},
            {"address": "36 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200079},
        ]
    )

    # Valuations keyed by UPRN, expanded into one record per property
    valuation_by_uprn = {
        100090136018: 586_000,
        100090144815: 446_000,
        100090144816: 448_000,
        10008078615: 763_000,
        10008078616: 616_000,
        10008078617: 593_000,
        200004200075: 450_000,
        200004200076: 457_000,
        200004200078: 304_000,
        200004200079: 313_000,
    }
    valuation_rows = [
        {"uprn": uprn, "valuation": value} for uprn, value in valuation_by_uprn.items()
    ]

    # Pull the Find My EPC data for every property that has a certificate
    find_my_epc_records = []
    for _, row in tqdm(properties.iterrows(), total=len(properties)):
        lookup = SearchEpc(
            address1=row["address"],
            postcode=row["postcode"],
            uprn=row["uprn"],
            auth_token=EPC_AUTH_TOKEN,
            os_api_key=""
        )
        lookup.find_property(skip_os=True)
        register_entry = lookup.newest_epc
        if register_entry is None:
            # Nothing on the register for this home, so there is nothing to retrieve
            continue

        scraper = RetrieveFindMyEpc(
            address=register_entry["address1"],
            postcode=register_entry["postcode"]
        )
        scraped = scraper.retrieve_newest_find_my_epc_data()
        time.sleep(0.5)

        # Keyed by the UPRN from the asset list
        find_my_epc_records.append({"uprn": row["uprn"], **scraped})

    recommendation_rows = []
    for record in find_my_epc_records:
        recommendation_rows.append(
            {"uprn": record["uprn"], "recommendations": record["recommendations"]}
        )

    filename = _store(pd.DataFrame(properties), f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv")

    # Store the non-invasive recommendations in s3
    non_invasive_recommendations_filename = _store(
        pd.DataFrame(recommendation_rows),
        f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    )

    # Store the valuations data in s3
    valuations_filename = _store(
        pd.DataFrame(valuation_rows),
        f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv"
    )

    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "B",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
        "valuation_file_path": valuations_filename,
        "scenario_name": "Wave 3 Packages",
        "multi_plan": True,
        "budget": None,
        "exclusions": []
    }
    print(body)
100071194376}, + + {"address": "96 Marsh Lane", "postcode": "WV10 6RX", "uprn": 100071176297}, + ] + asset_list = pd.DataFrame(asset_list) + + valuations_data = [ + {'uprn': 100071188496, "valuation": 175_000}, + {'uprn': 100071136271, "valuation": 183_000}, + {'uprn': 100071194376, "valuation": 221_000}, + {'uprn': 100071176297, "valuation": 208_000}, + ] + + # Pull the additional data + extracted_data = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + add1 = home["address"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key="" + ) + epc_searcher.find_property(skip_os=True) + if epc_searcher.newest_epc is None: + continue + + find_epc_searcher = RetrieveFindMyEpc(address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"]) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + extracted_data.append( + { + "uprn": home["uprn"], + **find_epc_data, + } + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Store the valuations data in s3 + valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(valuations_data), + bucket_name="retrofit-plan-inputs-dev", + file_name=valuations_filename + ) + + body = { + 
# Paths to the uploaded files
file_paths = [
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf",
    "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf"
]

# Compiled once: the report title and the column header that are stripped
# from the extracted text before rows are parsed
_TITLE_PATTERN = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
_HEADER_PATTERN = re.compile(r"Code Property Address Management Type")
# A row is everything up to (and including) the next "Managed" on the same line
_ROW_PATTERN = re.compile(r".*?Managed")
# Code is the first token; the address is whatever sits between it and "Managed"
_ROW_FIELDS_PATTERN = re.compile(r"(\S+)\s+(.+?)\s+Managed")


def extract_text_from_pdf_with_pypdf2(file_path):
    """
    Return the text of every page of the PDF at ``file_path``, concatenated.

    Pages are joined in one pass instead of repeated ``+=``, and a page for
    which ``extract_text()`` returns nothing contributes an empty string
    rather than raising ``TypeError``.
    """
    reader = PdfReader(file_path)
    return "".join(page.extract_text() or "" for page in reader.pages)


def parse_managed_rows(text):
    """
    Parse extracted report text into ``(code, address, "Managed")`` tuples.

    :param text: Raw text of one report, as returned by
        ``extract_text_from_pdf_with_pypdf2``.
    :return: List of tuples, one per row that ends in "Managed" and has both
        a code and an address; rows that do not fit that shape are dropped.
    """
    # Step 1: Remove titles and repeated headers
    cleaned_text = _TITLE_PATTERN.sub("", text)
    cleaned_text = _HEADER_PATTERN.sub("", cleaned_text)

    # Step 2: Extract rows ending with "Managed", then
    # Step 3: Parse rows into structured data
    parsed_rows = []
    for row in _ROW_PATTERN.findall(cleaned_text):
        match = _ROW_FIELDS_PATTERN.match(row.strip())
        if match:
            parsed_rows.append((match.group(1).strip(), match.group(2).strip(), "Managed"))
    return parsed_rows


def main():
    """Parse every PDF in ``file_paths`` and write the combined rows to Excel."""
    # Holds the parsed rows of all files
    all_parsed_data = []

    # Process each PDF individually
    for i, path in enumerate(file_paths):
        parsed_data = parse_managed_rows(extract_text_from_pdf_with_pypdf2(path))
        all_parsed_data.extend(parsed_data)

        # Provide feedback for debugging
        print(f"File {i + 1} processed: {len(parsed_data)} rows")

    # Step 4: Create a unified DataFrame
    final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"])

    # Step 5: Save the unified DataFrame to an Excel file
    final_output_file_path = (
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
    )
    final_df.to_excel(final_output_file_path, index=False)

    # Provide feedback
    print(f"All files processed and combined. Total rows: {len(final_df)}")
    print(f"Unified file saved to: {final_output_file_path}")


if __name__ == "__main__":
    main()
def app():
    """
    Prepares the inputs to produce the remote assessments for Cottons.

    Reads the Cottons asset list, keeps the properties whose registered EPC
    rating is D or E, looks each one up on the EPC register and retrieves the
    Find My EPC data for the registered address. The modelled asset list, the
    non-invasive recommendations and the Zoopla valuations are uploaded as
    CSVs to the plan-inputs bucket, and the request body that references
    those files is printed.

    Properties for which the EPC search returns no certificate are reported
    and skipped, as in the sibling remote-assessment scripts, instead of
    aborting the whole run with a ``TypeError``.

    :return: None
    """

    # Read in the asset list
    cottons_asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull with "
        "valuations.xlsx"
    )
    # A number are missing EPCs due to the space in the postcode
    # Breakdowns:
    # C 119
    # D 106
    # E 26
    # B 5
    #
    # Take the EPC D/E properties
    asset_list = cottons_asset_list[
        cottons_asset_list["EPC rating on register"].isin(["D", "E"])
    ]
    asset_list = asset_list.reset_index(drop=True)
    asset_list["row_id"] = asset_list.index
    asset_list["uprn"] = asset_list["uprn"].astype(int)

    extracted_data = []
    model_asset_list = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        add1 = home["address1"]
        pc = home["postcode"]
        # Retrieve the EPC data
        epc_searcher = SearchEpc(
            address1=add1,
            postcode=pc, uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key=""
        )
        epc_searcher.find_property(skip_os=True)
        newest_epc = epc_searcher.newest_epc
        if newest_epc is None:
            # Without a certificate there is no registered address to look up
            print(f"No EPC found for UPRN {home['uprn']} ({add1}, {pc}); skipping")
            continue

        find_epc_searcher = RetrieveFindMyEpc(address=newest_epc["address1"],
                                              postcode=newest_epc["postcode"])
        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
        time.sleep(0.5)

        # Keyed by the UPRN from the asset list
        extracted_data.append(
            {
                "uprn": home["uprn"],
                **find_epc_data,
            }
        )

        # The model is given the address as it appears on the register
        model_asset_list.append(
            {
                "uprn": home["uprn"],
                "address": newest_epc["address1"],
                "postcode": newest_epc["postcode"],
            }
        )

    non_invasive_recommendations = [
        {
            "uprn": r["uprn"],
            "recommendations": r["recommendations"]
        } for r in extracted_data
    ]

    # Only properties that actually have a valuation are sent
    valuations_data = asset_list[["uprn", "Zoopla Valuation"]].copy().rename(columns={"Zoopla Valuation": "valuation"})
    valuations_data = valuations_data[~pd.isnull(valuations_data["valuation"])]

    filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(model_asset_list),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Store the non-invasive recommendations in s3
    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(non_invasive_recommendations),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=non_invasive_recommendations_filename
    )

    # Store the valuations data in s3
    valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv"
    save_csv_to_s3(
        dataframe=valuations_data,
        bucket_name="retrofit-plan-inputs-dev",
        file_name=valuations_filename
    )

    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increasing EPC",
        "goal_value": "C",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
        "valuation_file_path": valuations_filename,
        "scenario_name": "Wave 3 Packages",
        "multi_plan": True,
        "budget": None,
        "exclusions": ['air_source_heat_pump', 'boiler_upgrade', 'floor_insulation']
    }
    print(body)
def hornsey():
    """
    Prepare and upload the Wave 3 modelling inputs for Hornsey Housing Trust,
    one of the additional housing associations in the consortium led by AIHA.

    Reads the Hornsey units sheet, cleans the address columns, and for every
    unit with a starting EPC band retrieves the Find My EPC data and the
    newest EPC register entry (for the UPRN and hot-water description). Units
    without a starting band are added to the asset list with a manually
    mapped UPRN. The asset list and the non-invasive recommendations are
    uploaded as CSVs to the plan-inputs bucket, and the request body that
    references those files is printed.

    :raises Exception: when the register's current energy efficiency does not
        match the value embedded in the sheet's starting band.
    :return: None
    """

    hornsey_asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
        "Trust.xlsx",
        sheet_name="Ksquared-All units information",
        header=3
    )

    # We don't need the first row
    hornsey_asset_list = hornsey_asset_list.iloc[1:]
    # Fill NA values with empty strings
    hornsey_asset_list = hornsey_asset_list.fillna("")
    for col in ["Address letter or number", "Street address", "Postcode"]:
        # Trim, then collapse any run of two or more spaces into one. The
        # previous literal replaced a single space with a single space,
        # which changed nothing.
        hornsey_asset_list[col] = (
            hornsey_asset_list[col].astype(str).str.strip().str.replace(r" {2,}", " ", regex=True)
        )

    # Copy so the column assignments below act on an independent frame
    hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""].copy()

    # `"Cavity" in series` tests the index labels, not the values, and gives
    # one scalar for the whole column; test each row's text instead.
    hornsey_asset_list["Wall Type Cleaned"] = np.where(
        hornsey_asset_list["Wall type"].astype(str).str.contains("Cavity", regex=False),
        "Cavity",
        "Solid"
    )

    # UPRNs for the units that have no EPC to look one up from
    missed_uprns = {
        "Flat 13A Stowell House": 100021213098,
        "Flat 24 Stowell House": 100021213110,
        "Flat 1 36 Haringey Park": None
    }
    extracted_data = []
    asset_list = []
    hornsey_asset_list["row_id"] = hornsey_asset_list.index
    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):

        if home["Address letter or number"] == "Flat 1 36 Haringey Park":
            continue

        # Some properties do not have an epc
        if not home["Energy starting band (EPC)"]:
            asset_list.append(
                {
                    "uprn": missed_uprns[home["Address letter or number"]],
                    "address": home["Address letter or number"],
                    "postcode": home["Postcode"],
                    "property_type": "Flat",  # They're all flats
                }
            )
            continue

        unit_number = home["Address letter or number"]
        street = home["Street address"]
        postcode = home["Postcode"]
        address = ", ".join([x for x in [unit_number, street] if x])
        find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
        time.sleep(0.5)
        # We need uprn
        searcher = SearchEpc(
            address1=address,
            postcode=postcode,
            auth_token=EPC_AUTH_TOKEN,
            os_api_key="",
            full_address=address,
        )
        searcher.find_property(skip_os=True)
        newest_epc = searcher.newest_epc
        # Sanity check that the register entry is the same certificate as the sheet's
        if newest_epc["current-energy-efficiency"] != home["Energy starting band (EPC)"].split("-")[1]:
            raise Exception("Something went wrong with the EPC data")

        extracted_data.append(
            {
                "uprn": newest_epc["uprn"],
                **find_epc_data,
                "hotwater-description": newest_epc["hotwater-description"],
            }
        )

        asset_list.append(
            {
                "uprn": newest_epc["uprn"],
                "row_id": home["row_id"],
                "address": home["Address letter or number"],
                "postcode": home["Postcode"],
                "property_type": "Flat",  # They're all flats
            }
        )

    # Get conservation area data
    # uprns = [x["uprn"] for x in extracted_data]
    # conservation_area_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
    #
    # addresses = pd.DataFrame(asset_list)
    # addresses["uprn"] = addresses["uprn"].astype(int)
    # conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN")
    # conservation_area_df.to_csv(
    #     "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/hornsey_conservation_area_data.csv"
    # )

    # We format the extracted data so that it has the same structure as non-intrusive recommendations
    # We then get the UPRNs and create the asset list

    non_invasive_recommendations = [
        {
            "uprn": r["uprn"],
            "recommendations": r["recommendations"]
        } for r in extracted_data
    ]
    # The two lists are index-aligned, so pair each entry with its own
    # extracted record. The previous lookup compared a record's UPRN with
    # itself and therefore always used the first property's record.
    for entry, extracted in zip(non_invasive_recommendations, extracted_data):
        no_cylinder = extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat"
        new_recommendations = []
        for rec in entry["recommendations"]:
            # Hot-water tank measures are dropped when the hot water comes from this system
            if no_cylinder and rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]:
                continue
            rec["survey"] = False
            new_recommendations.append(rec)
        entry["recommendations"] = new_recommendations

    # Store the asset list in s3
    filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(asset_list),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=filename
    )

    # Store the non-invasive recommendations in s3
    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(non_invasive_recommendations),
        bucket_name="retrofit-plan-inputs-dev",
        file_name=non_invasive_recommendations_filename
    )

    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Social",
        "goal": "Increasing EPC",
        "goal_value": "C",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
        "valuation_file_path": "",
        "scenario_name": "Wave 3 Packages",
        "multi_plan": True,
        "budget": None,
        "exclusions": ["boiler_upgrade"]
    }
    print(body)
caha_asset_list["Street address"] = caha_asset_list["Street address"].str.strip() + caha_asset_list["Postcode"] = caha_asset_list["Street address"].str.split(" ").str[-2:].str.join(" ") + # Take just the columns we need + caha_asset_list = caha_asset_list[["Address letter or number", "Street address", "Postcode"]] + + for col in ["Address letter or number", "Street address", "Postcode"]: + caha_asset_list[col] = caha_asset_list[col].str.replace(" ", " ") + + # Pull the data from find my epc + remap = { + "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road", + "Flat A, 51 First Avenue EN1 1BN": "51a, First Avenue", + "Flat B, 51 First Avenue EN1 1BN": "51b, First Avenue" + } + + def remap_address(address): + # Match patterns like 'Flat A, 30 Grove Park Road' + match = re.match(r'Flat (\w), (\d+) (.+)', address) + if match: + flat_letter = match.group(1) # e.g., 'A' + number = match.group(2) # e.g., '30' + rest_of_address = match.group(3) # e.g., 'Grove Park Road' + + # Format the new address as '30A Grove Park Road' + return f"{number}{flat_letter} {rest_of_address}" + + # If pattern doesn't match, return original address + return address + + caha_asset_list["row_id"] = caha_asset_list.index + + extracted_data = [] + asset_list = [] + for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)): + if home["Street address"] == "35 Stanford road N11 3HY" and home["Address letter or number"] == "": + continue + + if home["Street address"] == "29 Victoria Avenue N3 1BD" and home["Address letter or number"] == "": + continue + + if home["Street address"] == "11 Victoria Avenue N3 1BD" and home["Address letter or number"] == "Flat A": + continue + + if home["Street address"] == "11 Victoria Avenue N3 1BD" and home["Address letter or number"] == "Flat C": + continue + + if home["Street address"] == "10 Forest Gardens N17 6XA" and home["Address letter or number"] == "Flat C": + continue + + if home["Street address"] == "219 Cann Hall Road E11 3NJ" and 
home["Address letter or number"] == "Flat B": + continue + + unit_number = home["Address letter or number"] + street = home["Street address"] + postcode = home["Postcode"] + address = ", ".join([x for x in [unit_number, street] if x]) + address = remap.get(address, address) + address = address.replace(postcode, "").strip() + if "Victoria Avenue" not in address: + address = remap_address(address) + + find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data(sap_2012_date=EARLIEST_EPC_DATE) + time.sleep(0.5) + # We need uprn + searcher = SearchEpc( + address1=address, + postcode=postcode, + auth_token=EPC_AUTH_TOKEN, + os_api_key="", + full_address=address, + ) + searcher.find_property(skip_os=True) + newest_epc = searcher.newest_epc + + uprn = newest_epc["uprn"] + if address in ["Flat D, 11 Victoria Avenue", "Flat B, 11 Victoria Avenue"]: + uprn = None + + extracted_data.append( + { + "uprn": uprn, + **find_epc_data, + } + ) + + asset_list.append( + { + "row_id": home["row_id"], + "uprn": uprn, + "address": address, + "postcode": home["Postcode"], + "property_type": newest_epc["property-type"], + "wall_type": newest_epc["walls-description"], + "built_form": newest_epc["built-form"], + "flat_storey_count": newest_epc['flat-storey-count'], + } + ) + + # Missing row ids + missed = [r for r in caha_asset_list["row_id"].tolist() if r not in [x["row_id"] for x in asset_list]] + + no_data = [x for x in asset_list if x["uprn"] in [None, ""]] + no_data = pd.DataFrame(no_data) + + # Get conservation area data + uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] + conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev") + + addresses = pd.DataFrame(asset_list) + addresses["uprn"] = addresses["uprn"].astype(str) + conservation_area_data["UPRN"] = conservation_area_data["UPRN"].astype(str) + conservation_area_df = 
conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN") + conservation_area_df.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_conservation_area_data.csv" + ) + + non_invasive_recommendations = [ + { + "uprn": r["uprn"], + "recommendations": r["recommendations"] + } for r in extracted_data + ] + # for r in non_invasive_recommendations: + # new_recommendations = [] + # extracted = [r for r in extracted_data if r["uprn"] == r["uprn"]][0] + # for rec in r["recommendations"]: + # if extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat": + # if rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]: + # continue + # rec["survey"] = False + # new_recommendations.append(rec) + # r["recommendations"] = new_recommendations + + # We model the two properties separately + asset_list = pd.DataFrame(asset_list) + # Drop Flat D, 11 Victoria Avenue + asset_list1 = asset_list[asset_list["address"] != "Flat D, 11 Victoria Avenue"] + asset_list2 = asset_list[asset_list["address"] == "Flat D, 11 Victoria Avenue"] + + # Store the asset list in s3 + filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list1.csv" + save_csv_to_s3( + dataframe=asset_list1, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + filename2 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list2.csv" + save_csv_to_s3( + dataframe=asset_list2, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename2 + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + body = { + "portfolio_id": str(CAHA_PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + 
"already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body) + + body2 = { + "portfolio_id": str(CAHA_PORTFOLIO_ID), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename2, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body2) + + # + asset_list3 = [ + { + "address": "10b Forest Gardens", "postcode": "N17 6XA", "uprn": 100021180197 + } + ] + filename3 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list3.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list3), + bucket_name="retrofit-plan-inputs-dev", + file_name=filename3 + ) + body3 = { + "portfolio_id": str(119), + "housing_type": "Social", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename3, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": "", + "valuation_file_path": "", + "scenario_name": "Wave 3 Packages", + "multi_plan": True, + "budget": None, + "exclusions": ["boiler_upgrade"] + } + print(body3) diff --git a/etl/customers/l_and_g/ic_asset_list.py b/etl/customers/l_and_g/ic_asset_list.py new file mode 100644 index 00000000..d0966bdf --- /dev/null +++ b/etl/customers/l_and_g/ic_asset_list.py @@ -0,0 +1,166 @@ +""" +This script prepares the asset list for modelling the properties from the L&Q dataset, for their January IC +""" + +import pandas as pd +import numpy as np + +from etl.route_march_data_pull.app import get_data +from utils.s3 import save_csv_to_s3 + +PORTFOLIO_ID = 124 
def app(
    asset_data_path=(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon information for Domna/Basildon MDS v1.4 "
        "(1).xlsx"
    ),
    missed_uprns_path="/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/missed_epcs_uprn.csv",
    bucket_name="retrofit-plan-inputs-dev",
):
    """
    Build the Basildon asset list and non-invasive recommendations, upload both to S3 and print the
    request body that triggers the modelling run.

    Steps:
      1. Read the "Basildon" sheet and derive ``address1`` / ``full_address`` for every row.
      2. Look the properties up with ``get_data``; rows reported in ``no_epc`` are treated as missed.
      3. For missed rows, attach the customer's own property type and a UPRN read from
         ``missed_uprns_path``.
      4. Concatenate fetched and missed rows into the final asset list and upload it, together with the
         recommendations taken from ``find_my_epc_data``.

    :param asset_data_path: Path of the customer workbook (sheet "Basildon", header on row index 5).
    :param missed_uprns_path: CSV with ``address1``, ``postcode`` and ``UPRN`` columns for the properties
        that had no EPC.
    :param bucket_name: S3 bucket that receives the generated CSV files.
    :return: None. The request body is printed.
    """
    asset_data = pd.read_excel(asset_data_path, sheet_name="Basildon", header=5)

    # The last three rows of the sheet are discarded - presumably footer/summary rows; TODO confirm
    # against the workbook.
    asset_data = asset_data.head(-3).reset_index(drop=True)
    # Positional key for each source row. It is used for every later join so that two rows sharing the
    # same address1/PostCode can never multiply each other in a merge.
    asset_data["row_id"] = asset_data.index

    address_1_missing = pd.isnull(asset_data["Address 1"])
    asset_data["address1"] = np.where(
        address_1_missing,
        asset_data["Address 2"],
        asset_data["Address 1"]
    )
    asset_data["full_address"] = np.where(
        address_1_missing,
        asset_data["Address 2"] + ", " + asset_data["Address 3"],
        asset_data["Address 1"] + ", " + asset_data["Address 2"] + ", " + asset_data["Address 3"],
    )

    asset_list = asset_data[["address1", "PostCode", "full_address", "Bedrooms", "row_id"]].copy()

    # L&G's focus:
    # Measures: loft and cavity insulation, replacement thermally efficient windows, PV cells, AS heat pumps.

    epc_data, _errors, no_epc = get_data(
        asset_list=asset_list,
        fulladdress_column="full_address",
        address1_column="address1",
        postcode_column="PostCode",
        manual_uprn_map={}
    )

    # Properties without an EPC: carry over the customer's own property type, joined on row_id.
    missed = asset_list[asset_list["row_id"].isin(no_epc)].merge(
        asset_data[["row_id", "Property Type"]],
        how="left",
        on="row_id"
    )
    # Remap Block: Residential to Flat
    missed["Property Type"] = np.where(
        missed["Property Type"] == "Block: Residential",
        "Flat",
        missed["Property Type"]
    )
    missed = missed.rename(columns={"PostCode": "postcode"}).drop(columns=["row_id"])

    # Properties with an EPC: keep the fetched address/uprn and add the bedroom count from the asset list.
    fetched_asset_list = (
        pd.DataFrame(epc_data)[["address1", "postcode", "uprn", "row_id"]]
        .merge(asset_list[["row_id", "Bedrooms"]], how="left", on=["row_id"])
        .drop(columns=["row_id"])
    )

    # Some properties genuinely never had an EPC. Their UPRNs come from a separate CSV, which is keyed
    # on address1 + postcode only.
    missed_uprns = pd.read_csv(missed_uprns_path)
    missed = missed.merge(
        missed_uprns[["address1", "postcode", "UPRN"]].rename(columns={"UPRN": "uprn"}),
        how="left",
        on=["address1", "postcode"]
    )

    # Concatenate fetched and missed properties into a single asset list.
    final_asset_list = pd.concat(
        [fetched_asset_list, missed[["address1", "postcode", "Property Type", "Bedrooms", "uprn"]]]
    ).rename(
        columns={
            "address1": "address",
            "Property Type": "property_type",
            "Bedrooms": "n_bedrooms"
        }
    )

    # Extract the non-invasive recommendations
    non_invasive_recommendations = [
        {
            "uprn": x["uprn"],
            "recommendations": x["find_my_epc_data"]["recommendations"]
        }
        for x in epc_data
    ]

    filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
    save_csv_to_s3(
        dataframe=final_asset_list,
        bucket_name=bucket_name,
        file_name=filename
    )

    # Store the non-invasive recommendations in s3
    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
    save_csv_to_s3(
        dataframe=pd.DataFrame(non_invasive_recommendations),
        bucket_name=bucket_name,
        file_name=non_invasive_recommendations_filename
    )

    body = {
        "portfolio_id": str(PORTFOLIO_ID),
        "housing_type": "Private",
        "goal": "Increasing EPC",
        "goal_value": "A",
        "trigger_file_path": filename,
        "already_installed_file_path": "",
        "patches_file_path": "",
        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
        "valuation_file_path": "",
        "scenario_name": "Retrofit Packages",
        "multi_plan": True,
        "budget": None,
        "inclusions": [
            "cavity_wall_insulation",
            "loft_insulation",
            "windows",
            "solar_pv",
            "air_source_heat_pump"
        ]
    }
    print(body)
def get_data(portfolio_id, scenario_ids):
    """
    Fetch the properties of a portfolio, and the plans and default recommendations of the given scenarios.

    :param portfolio_id: Value matched against ``PropertyModel.portfolio_id``.
    :param scenario_ids: Collection of scenario IDs matched against ``Plan.scenario_id``.
    :return: Tuple ``(properties_data, plans_data, recommendations_data)``. Each element is a list of
        dicts keyed by table column name; recommendation dicts additionally carry a "Scenario ID" key.
    """
    session = sessionmaker(bind=db_engine)()
    # try/finally so the session is released even when one of the queries raises.
    try:
        session.begin()

        # Properties joined to their EPC details, restricted to the requested portfolio
        property_rows = session.query(
            PropertyModel,
            PropertyDetailsEpcModel
        ).join(
            PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
        ).filter(
            PropertyModel.portfolio_id == portfolio_id
        ).all()

        # Flatten both models into one dict per property. Where the two tables share a column name,
        # the PropertyDetailsEpcModel value wins because it is unpacked last.
        properties_data = [
            {
                **{col.name: getattr(row.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
                **{
                    col.name: getattr(row.PropertyDetailsEpcModel, col.name)
                    for col in PropertyDetailsEpcModel.__table__.columns
                },
            }
            for row in property_rows
        ]

        # Plans are selected by scenario only; they are not filtered by the portfolio's properties here.
        plans = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()
        plans_data = [
            {col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
            for plan in plans
        ]

        # Plan IDs used to reach the recommendations through PlanRecommendations
        plan_ids = [plan["id"] for plan in plans_data]

        # Default recommendations attached to those plans, together with the owning plan's scenario_id
        recommendation_rows = session.query(
            Recommendation,
            Plan.scenario_id
        ).join(
            PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
        ).join(
            Plan, Plan.id == PlanRecommendations.plan_id
        ).filter(
            PlanRecommendations.plan_id.in_(plan_ids),
            Recommendation.default == True  # noqa: E712 - SQL expression, not a Python comparison
        ).all()

        recommendations_data = []
        for row in recommendation_rows:
            # Rows normally expose the entity as ``row.Recommendation``; fall back to the row itself
            # otherwise, as the original code did.
            entity = row.Recommendation if hasattr(row, "Recommendation") else row
            record = {col.name: getattr(entity, col.name) for col in Recommendation.__table__.columns}
            record["Scenario ID"] = row.scenario_id
            recommendations_data.append(record)
    finally:
        session.close()

    return properties_data, plans_data, recommendations_data
recommendations_measures_pivot.rename( + columns={ + "air_source_heat_pump": "Cost: Air Source Heat Pump", + "cavity_wall_insulation": "Cost: Cavity Wall Insulation", + "double_glazing": "Cost: Double Glazing", + "loft_insulation": "Cost: Loft Insulation", + "mechanical_ventilation": "Cost: Ventilation", + "solar_pv": "Cost: Solar PV" + } +) +recommendations_measures_pivot = recommendations_measures_pivot.fillna(0) +recommendations_measures_pivot["Recommendation: Air Source Heat Pump"] = ( + recommendations_measures_pivot["Cost: Air Source Heat Pump"] > 0 +) +recommendations_measures_pivot["Recommendation: Cavity Wall Insulation"] = ( + recommendations_measures_pivot["Cost: Cavity Wall Insulation"] > 0 +) +recommendations_measures_pivot["Recommendation: Double Glazing"] = ( + recommendations_measures_pivot["Cost: Double Glazing"] > 0 +) +recommendations_measures_pivot["Recommendation: Loft Insulation"] = ( + recommendations_measures_pivot["Cost: Loft Insulation"] > 0 +) +recommendations_measures_pivot["Recommendation: Ventilation"] = ( + recommendations_measures_pivot["Cost: Ventilation"] > 0 +) +recommendations_measures_pivot["Recommendation: Solar PV"] = ( + recommendations_measures_pivot["Cost: Solar PV"] > 0 +) + +df = properties_df[ + [ + "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", + "current_epc_rating", + "current_sap_points", "total_floor_area", "number_of_rooms", + ] +].merge( + recommendations_measures_pivot, how="left", on="property_id" +).merge( + post_install_sap, how="left", on="property_id" +) + +df = df.drop(columns=["property_id"]) +df["sap_points"] = df["sap_points"].fillna(0) + +df = df.rename( + columns={ + "uprn": "UPRN", + "address": "Address", + "postcode": "Postcode", + "walls": "Walls", + "roof": "Roof", + "heating": "Heating", + "windows": "Windows", + "current_epc_rating": "Current EPC Rating", + "current_sap_points": "Current SAP Points", + "total_floor_area": "Total Floor Area", 
+ "number_of_rooms": "Number of Habitable Rooms", + "floor_height": "Floor Height", + } +) + +df["Has Recommendations"] = ~pd.isnull(df["Cost: Air Source Heat Pump"]) + +# We fill missings: +for col in [ + "Recommendation: Air Source Heat Pump", "Recommendation: Cavity Wall Insulation", + "Recommendation: Double Glazing", "Recommendation: Loft Insulation", "Recommendation: Ventilation", + "Recommendation: Solar PV" +]: + df[col] = df[col].fillna(False) + +for col in [ + "Cost: Air Source Heat Pump", "Cost: Cavity Wall Insulation", + "Cost: Double Glazing", "Cost: Loft Insulation", "Cost: Ventilation", + "Cost: Solar PV" +]: + df[col] = df[col].fillna(0) + +# Calculate post SAP +df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] +df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() +df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) + +df["Recommendation: Air Source Heat Pump"].sum() +df["Cost: Air Source Heat Pump"].sum() + +df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False) diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py new file mode 100644 index 00000000..1de91b50 --- /dev/null +++ b/etl/customers/lambeth/re-knocks.py @@ -0,0 +1,23 @@ +import pandas as pd + +data = pd.read_excel( + "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route", + header=1 +) + +data["Outcomes"].value_counts() + +# Strip out: No + +df = data[data["Outcomes"] == "See notes"] +notes_df = df[ + ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)")].value_counts().to_frame() + +example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where " + "possible?)"] == ('Access to rear of property only through number 10. 
def get_data(asset_list, error_pause_seconds=5):
    """
    Look up the newest EPC, and its recommendations, for every row of ``asset_list``.

    :param asset_list: DataFrame with "Postcode", "Number", "Full Address" and "row_id" columns.
    :param error_pause_seconds: Seconds to sleep after a failed lookup before moving to the next row.
    :return: Tuple ``(epc_data, errors)``. ``epc_data`` is a list of dicts (the newest EPC plus "row_id"
        and "recommendations"); ``errors`` lists the row_ids whose lookup raised. Rows for which no EPC
        was found appear in neither list.
    """
    import logging

    log = logging.getLogger(__name__)

    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            searcher = SearchEpc(
                address1=str(home["Number"]),
                postcode=home["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=home["Full Address"],
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                continue

            # Recommendations are best-effort: a failure here must not discard the EPC itself.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                log.warning("No EPC recommendations retrieved for row_id=%s", home["row_id"], exc_info=True)
                property_recommendations = {"rows": []}

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc.copy(),
                    "recommendations": property_recommendations["rows"]
                }
            )
        except Exception:
            # Record the failure so the caller can retry, and pause before the next request.
            log.exception("EPC lookup failed for row_id=%s", home["row_id"])
            errors.append(home["row_id"])
            time.sleep(error_pause_seconds)

    return epc_data, errors


def _build_recommendation_flags(recommendations_df):
    """Return one row per row_id with a True/False column for each distinct recommendation text."""
    texts_per_row = [
        (row_id, {rec["improvement-summary-text"] for rec in recommendations})
        for row_id, recommendations in zip(recommendations_df["row_id"], recommendations_df["recommendations"])
    ]
    all_texts = sorted(set().union(*(texts for _, texts in texts_per_row)))

    flags = pd.DataFrame(
        [
            {"row_id": row_id, **{text: text in texts for text in all_texts}}
            for row_id, texts in texts_per_row
        ]
    )
    # Recommendations with an empty summary text carry no information. The column only exists when at
    # least one such recommendation was returned, so its absence must not raise.
    return flags.drop(columns=[""], errors="ignore")


def _floor_height_or_default(value, default=2.5):
    """Return ``value`` as a float, or ``default`` when it is missing (None, NaN, "" or 0)."""
    if pd.isnull(value) or not value:
        return default
    return float(value)


def app(
    input_path="/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx",
    output_filename="livewest EPC Data pull - 29 Oct.xlsx",
):
    """
    Pull EPC data for a list of properties owned by Livewest and write the enriched list to Excel.

    Data request contents:
        Date of last EPC
        Reason for EPC
        SAP score on register
        Property Type
        Property Area
        Property Age
        Any Dimensions (HLP,PW,RH)
        Property Wall Construction
        Heating Type
        Secondary Heating
        Loft Insulation Depth

    Additional if possible:
        Heat loss calculations
        EPC recommendations
        Property UPRN

    :param input_path: Excel workbook holding the asset list (header on the first row).
    :param output_filename: Path of the Excel file to write.
    :return: None.
    """
    asset_list = pd.read_excel(input_path, header=0)
    asset_list["row_id"] = asset_list.index

    epc_data, errors = get_data(asset_list)

    # Retry, once, the properties whose lookup raised on the first pass
    epc_data_failed, _ = get_data(asset_list[asset_list["row_id"].isin(errors)])
    epc_data.extend(epc_data_failed)

    epc_df = pd.DataFrame(epc_data)

    # Expand the recommendations into one boolean column per recommendation text
    transformed_df = _build_recommendation_flags(epc_df[["row_id", "recommendations"]])

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "row_id",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
        ]
    ]

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        transformed_df,
        how="left",
        on="row_id"
    )

    asset_list = asset_list.drop(columns=["row_id"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "Property Type",
        "built-form": "Archetype",
        "total-floor-area": "Property Floor Area",
        "construction-age-band": "Property Age Band",
        "floor-height": "Property Floor Height",
        "number-habitable-rooms": "Number of Habitable Rooms",
        "walls-description": "Wall Construction",
        "roof-description": "Roof Construction",
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
        "energy-consumption-current": "Heat Demand (kWh/m2)"
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    # The EPC fields can hold "" for missing values; coerce those (and any other non-numeric text) to
    # NaN rather than relying on Series.replace("", None), whose meaning differs between pandas versions.
    for numeric_column in ["Property Floor Area", "Number of Habitable Rooms"]:
        asset_list[numeric_column] = pd.to_numeric(asset_list[numeric_column], errors="coerce").astype(float)

    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            # Missing heights (including NaN left by the merge) fall back to 2.5
            floor_height=_floor_height_or_default(x["Property Floor Height"]),
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # Store as an excel
    asset_list.to_excel(output_filename, index=False)
"BLNDG_GOVERMENT_UPRN"]].drop_duplicates(), + how="left", on="BLDNG_ID" + ) + sample_list["BLNDG_GOVERMENT_UPRN"] = sample_list["BLNDG_GOVERMENT_UPRN"].astype("Int64") + + # Use the EPC API to get corrected postcodes + model_asset_list = [] + missed = [] + for _, x in tqdm(sample_list.iterrows(), total=len(sample_list)): + + if pd.isnull(x["BLNDG_GOVERMENT_UPRN"]): + continue + searcher = SearchEpc( + address1="", + postcode="", + uprn=x["BLNDG_GOVERMENT_UPRN"], + auth_token=EPC_AUTH_TOKEN, + os_api_key="" + ) + searcher.find_property(skip_os=True) + newest_epc = searcher.newest_epc + if newest_epc is None: + missed.append(x["BLNDG_GOVERMENT_UPRN"]) + continue + + model_asset_list.append(newest_epc) + + model_asset_list = pd.DataFrame(model_asset_list) + model_asset_list["uprn"] = model_asset_list["uprn"].astype(int) + + spatial_data = OpenUprnClient.get_spatial_data( + uprns=model_asset_list["uprn"].tolist(), bucket_name="retrofit-data-dev" + ) + + # We determine if the building is listed, heritage or in a conservation area + + # Merge on the property features + features = asset_data.drop( + columns=["BUILDING_SYSTEM_ITEM_NAME", "OBSERVED_CONDITION_DESCRIPTION"] + ).drop_duplicates() + + df = features.merge( + model_asset_list, how="inner", right_on="uprn", left_on="BLNDG_GOVERMENT_UPRN" + ).merge( + pd.DataFrame(spatial_data).rename(columns={"UPRN": "uprn"}), how="left", on="uprn" + ) + + # Store data locally + # df.to_csv(folder_path + "/MOD property data.csv", index=False) + + # Produce as asset list for analysis + + df["row_id"] = df.index + + epc_data, errors, no_epc = get_data( + df=df, + manual_uprn_map={}, + epc_auth_token=EPC_AUTH_TOKEN, + uprn_column="uprn", + fulladdress_column="address", + address1_column="address1", + postcode_column="postcode", + property_type_column=None, + built_form_column=None, + epc_api_only=False, + row_id_name="row_id", + ) + + non_invasive_recommendations = [] + for x in epc_data: + non_invasive_recommendations.append( + { + 
"uprn": x["uprn"], + "recommendations": x["find_my_epc_data"]["recommendations"] + } + ) + + # also include the floor area + asset_list = df[ + ["uprn", "address1", "postcode", "NUMBER_OF_BEDROOMS", "BLDNG_STOREYS_QTY", "BLDNG_MSRMNT_VAL"] + ].rename( + columns={ + "address1": "address", + "NUMBER_OF_BEDROOMS": "n_bedrooms", + "BLDNG_STOREYS_QTY": "number_of_floors", + "BLDNG_MSRMNT_VAL": "floor_area" + } + ) + + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + # Store the non-invasive recommendations in s3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + # Scenario 1 - EPC C + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Hit EPC C", + "multi_plan": True, + "budget": None, + # "inclusions": [ + # "cavity_wall_insulation", + # "loft_insulation", + # "windows", + # "solar_pv", + # "air_source_heat_pump" + # ] + } + print(body) + + # Scenario 2 - EPC B + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "B", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": "", + "scenario_name": "Hit EPC B", + "multi_plan": True, + "budget": None, + # "inclusions": [ + # "cavity_wall_insulation", + # "loft_insulation", + # 
def get_data(portfolio_id, scenario_ids):
    """
    Fetch the properties of a portfolio, and the plans and default recommendations of the given scenarios.

    :param portfolio_id: Value matched against ``PropertyModel.portfolio_id``.
    :param scenario_ids: Collection of scenario IDs matched against ``Plan.scenario_id``.
    :return: Tuple ``(properties_data, plans_data, recommendations_data)``. Each element is a list of
        dicts keyed by table column name; recommendation dicts additionally carry a "Scenario ID" key.
    """
    session = sessionmaker(bind=db_engine)()
    # try/finally so the session is released even when one of the queries raises.
    try:
        session.begin()

        # Properties joined to their EPC details, restricted to the requested portfolio
        property_rows = session.query(
            PropertyModel,
            PropertyDetailsEpcModel
        ).join(
            PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
        ).filter(
            PropertyModel.portfolio_id == portfolio_id
        ).all()

        # Flatten both models into one dict per property. Where the two tables share a column name,
        # the PropertyDetailsEpcModel value wins because it is unpacked last.
        properties_data = [
            {
                **{col.name: getattr(row.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
                **{
                    col.name: getattr(row.PropertyDetailsEpcModel, col.name)
                    for col in PropertyDetailsEpcModel.__table__.columns
                },
            }
            for row in property_rows
        ]

        # Plans are selected by scenario only; they are not filtered by the portfolio's properties here.
        plans = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()
        plans_data = [
            {col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
            for plan in plans
        ]

        # Plan IDs used to reach the recommendations through PlanRecommendations
        plan_ids = [plan["id"] for plan in plans_data]

        # Default recommendations attached to those plans, together with the owning plan's scenario_id
        recommendation_rows = session.query(
            Recommendation,
            Plan.scenario_id
        ).join(
            PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
        ).join(
            Plan, Plan.id == PlanRecommendations.plan_id
        ).filter(
            PlanRecommendations.plan_id.in_(plan_ids),
            Recommendation.default == True  # noqa: E712 - SQL expression, not a Python comparison
        ).all()

        recommendations_data = []
        for row in recommendation_rows:
            # Rows normally expose the entity as ``row.Recommendation``; fall back to the row itself
            # otherwise, as the original code did.
            entity = row.Recommendation if hasattr(row, "Recommendation") else row
            record = {col.name: getattr(entity, col.name) for col in Recommendation.__table__.columns}
            record["Scenario ID"] = row.scenario_id
            recommendations_data.append(record)
    finally:
        session.close()

    return properties_data, plans_data, recommendations_data
pd.DataFrame(recommendations_data) + + # Merge on the orignal data + mod_property_data = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv" + ) + + property_asset_data = properties_df.merge( + mod_property_data.drop(columns=["address", "postcode", "tenure"]), how="left", on="uprn" + ) + + property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False) + property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970 + property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip() + property_asset_data["is_insulated"] = ( + property_asset_data["walls"].str.split(",").str[1].str.strip().isin( + ["filled cavity", "with external insulation", "filled cavity and external insulation"] + ) | property_asset_data["walls"].str.split(",").str[2].str.strip().isin(["insulated"]) + ) + property_asset_data["is_insulated"] = np.where( + property_asset_data["is_insulated"], "Insulated", "Uninsulated" + ) + property_asset_data["is_pitched"] = np.where( + property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof" + ) + property_asset_data["pre_1970"] = np.where( + property_asset_data["pre_1970"], "Pre 1970", "Post 1970" + ) + + archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_1970"] + + assigned_archetypes = ( + property_asset_data.groupby( + archetype_variables + ).size().reset_index().rename(columns={0: "n_properties"}).sort_values("n_properties", ascending=False) + ) + + # Make the archetype ID a concatenation of the variables + assigned_archetypes["archetype_id"] = assigned_archetypes[archetype_variables].apply( + lambda x: "_".join(x.astype(str)), axis=1 + ) + + # Most prominent archetypes + prominent_archetypes = assigned_archetypes.head(6) + other_archetypes = assigned_archetypes.tail(-6) + # 2 or fewer properties in the other archetypes + + property_asset_data = 
property_asset_data.merge( + assigned_archetypes[archetype_variables + ["archetype_id"]], + how="left", + on=archetype_variables + ) + + # Create age bands: + # 1960-1969 + # 1970-1979 + # 1980-1989 + # 1990-1999 + # 2000+ + property_asset_data["age_band"] = pd.cut( + property_asset_data["BUILD_YEAR"], + bins=[1959, 1969, 1979, 1989, 1999, 2022], + labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"] + ) + + # Create floor area bands + # 0-73 + # 74-97 + # 98-199 + # 200+ + property_asset_data["floor_area_band"] = pd.cut( + property_asset_data["total_floor_area"], + bins=[0, 73, 97, 199, 10000], + labels=["0-73", "74-97", "98-199", "200+"] + ) + + property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy() + property_asset_data["archetype_group"] = np.where( + property_asset_data["archetype_id"].isin(other_archetypes["archetype_id"].values), + "other", + property_asset_data["archetype_group"] + ) + + # For colour + wall_types = ( + property_asset_data[["wall_type"]].value_counts().to_frame().reset_index().rename( + columns={"wall_type": "Wall Type"} + ) + ) + # Group into age bands + ages = ( + property_asset_data[["age_band"]].value_counts() + .to_frame() + .reset_index().sort_values("age_band", ascending=True) + .rename(columns={"age_band": "Age Band"}) + ) + floor_area_bands = ( + property_asset_data[["floor_area_band"]].value_counts() + .to_frame() + .reset_index().sort_values("floor_area_band", ascending=True) + .rename(columns={"floor_area_band": "Floor Area Band"}) + ) + archetype_counts = ( + property_asset_data[["archetype_group"]]. + value_counts(). + to_frame(). + reset_index() + .rename(columns={"archetype_group": "Archetype"}) + ) + property_types = ( + (property_asset_data["property_type"] + ": " + property_asset_data["built_form"]). + value_counts(). + to_frame(). 
+ reset_index() + .rename(columns={"index": "Property Type", 0: "Count"}) + ) + + # epc breakdown + epc_breakdown = ( + property_asset_data["current_epc_rating"] + .apply(lambda x: x.value) + .value_counts() + .to_frame() + .reset_index() + ) + + # Figures for the deck + # Carbon per property + totals = property_asset_data[ + [ + "Total_household_members", + "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater", + "heating_cost_current", "hot_water_cost_current", "lighting_cost_current", + "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge" + ] + ].copy() + totals["total_cost"] = ( + totals["heating_cost_current"] + + totals["hot_water_cost_current"] + + totals["lighting_cost_current"] + + totals["appliances_cost_current"] + + totals["gas_standing_charge"] + + totals["electricity_standing_charge"] + ) + print( + totals[ + [ + "Total_household_members", + "co2_emissions", + "current_energy_demand", + "total_cost", + ] + ].mean() + ) + + # Store these to an excel + # with pd.ExcelWriter( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx" + # ) as writer: + # wall_types.to_excel(writer, sheet_name="Wall Types", index=False) + # ages.to_excel(writer, sheet_name="Ages", index=False) + # floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False) + # archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False) + # epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False) + + contingency = 0.26 + + # We prepare the outputs, by scenario + scenario_data = {} + for scenario in scenario_ids: + + scenario_recommendations_df = recommendations_df[ + recommendations_df["Scenario ID"] == scenario + ].copy() + + scenario_recommendations_df["contingency"] = contingency * scenario_recommendations_df["estimated_cost"] + scenario_recommendations_df["total_cost"] = ( + scenario_recommendations_df["estimated_cost"] + 
scenario_recommendations_df["contingency"] + ) + + recommended_measures_df = scenario_recommendations_df[ + ["property_id", "measure_type", "estimated_cost", "default"] + ] + + recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]] + recommended_measures_df = recommended_measures_df.drop(columns=["default"]) + + # Metrics by property ID + aggregated_metrics = scenario_recommendations_df[ + [ + "property_id", "type", "default", "sap_points", + "energy_cost_savings", "kwh_savings", "co2_equivalent_savings", "estimated_cost", "contingency", + "total_cost" + ] + ] + aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]] + aggregated_metrics = aggregated_metrics.groupby("property_id")[ + ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost", + "total_cost", "contingency"] + ].sum().reset_index() + + recommendations_measures_pivot = recommended_measures_df.pivot( + index='property_id', + columns='measure_type', + values='estimated_cost' + ) + recommendations_measures_pivot = recommendations_measures_pivot.reset_index() + recommendations_measures_pivot = recommendations_measures_pivot.fillna(0) + + # We flag with boolean if the measure is recommended + for c in recommendations_measures_pivot.columns: + if c == "property_id": + continue + recommendations_measures_pivot["Recommendation: " + c] = recommendations_measures_pivot[c] > 0 + + # We now create a final output + df = properties_df[ + [ + "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows", + "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms", + "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater", + "heating_cost_current", "hot_water_cost_current", "lighting_cost_current", + "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge" + ] + ].merge( + recommendations_measures_pivot, how="left", 
on="property_id" + ).merge( + aggregated_metrics, how="left", on="property_id" + ) + + df["bills_total_cost"] = ( + df["heating_cost_current"] + df["hot_water_cost_current"] + df["lighting_cost_current"] + + df["appliances_cost_current"] + df["gas_standing_charge"] + df["electricity_standing_charge"] + ) + + df = df.drop(columns=["property_id"]) + for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]: + df[c] = df[c].fillna(0) + + df = df.rename( + columns={ + "uprn": "UPRN", + "address": "Address", + "postcode": "Postcode", + "walls": "Walls", + "roof": "Roof", + "heating": "Heating", + "windows": "Windows", + "current_epc_rating": "Current EPC Rating", + "current_sap_points": "Current SAP Points", + "total_floor_area": "Total Floor Area", + "number_of_rooms": "Number of Habitable Rooms", + "floor_height": "Floor Height", + } + ) + + # Calculate post SAP + df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"] + df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round() + df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x)) + + # Calculate the relative savings on carbon, kwh, and bills + df["relative_carbon_savings"] = df["co2_equivalent_savings"] / df["co2_emissions"] + df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"] + df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"] + + # Add on the archetype + df = df.merge( + property_asset_data[["uprn", "archetype_group"]], how="left", left_on="UPRN", right_on="uprn" + ) + + # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it + # the bills go up recommending HHRSH, so it doesn't make it to EPC B + # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump? + # DO it manually??? 
+ + # Doesn't make it + # misses = df[df["Predicted Post Works EPC"] == "C"] + # # 5 of them are flats and so are difficult to get to EPC B without renewables. Possibly not worth it from an + # # ROI perspective + # + # misses[["UPRN", "Address", "Postcode", "property_type"]] + + # UPRN Address Postcode property_type + # 2 100120988937 13 Sidbury Circular Road SP9 7HX Flat No further action + # 3 100120988998 74 Sidbury Circular Road SP9 7JA Flat No further action + # 4 100120989416 47 Zouch Avenue SP9 7LR Flat No further action + # 6 100060585002 42, Muscott Close, Shipton Bellinger SP9 7TX House Can probably take a heat pump + # 37 10000801072 34 Luffenham Place, Chicksands SG17 5XH House Already surveyed as having + # an ASHP - should be looked at + # 121 100120988259 8, Karachi Close SP9 7LW Flat + # 122 100121101217 599, Pepper Place BA12 0DW Flat + # 140 100021455241 33 Blenheim Crescent, Ruislip HA4 7HA House - Solar isnt recommended + # due to bug + # 149 100120915656 10 Bower Green, Shrivenham SN6 8TU House - Solar isn't recommended + # due to bug + + scenario_data[scenario] = df + + printing_scenario_id = scenario_ids[0] + # EPC breakdown + print(scenario_data[printing_scenario_id]['Predicted Post Works EPC'].value_counts()) + # Cost + # Total cost + print(scenario_data[printing_scenario_id]["total_cost"].sum()) + # Base cost + print(scenario_data[printing_scenario_id]["estimated_cost"].sum()) + # Contingency + print(scenario_data[printing_scenario_id]["contingency"].sum()) + # Costs averaged per unit + print(scenario_data[printing_scenario_id]["total_cost"].mean()) + print(scenario_data[printing_scenario_id]["estimated_cost"].mean()) + print(scenario_data[printing_scenario_id]["contingency"].mean()) + + # Average relative savings + print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean()) + print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean()) + 
print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean()) + + measure_details = {} + for scenario in scenario_ids: + measure_details[scenario] = {} + recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c] + measure_details[scenario]["count"] = scenario_data[scenario][recommendation_cols].sum().to_dict() + # Get average cost per measure + measure_columns = [ + c.split("Recommendation: ")[1] for c in scenario_data[scenario].columns if "Recommendation:" in c + ] + # Take the mean, drop zero columns + measure_costs = {} + for m in measure_columns: + measure_costs[m] = float(scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean()) + measure_details[scenario]["cost_per_measure"] = measure_costs + + pprint(measure_details[scenario_ids[0]]["count"]) + pprint(measure_details[scenario_ids[1]]["count"]) + + # Cost per measures + pprint(measure_details[scenario_ids[0]]["cost_per_measure"]) + pprint(measure_details[scenario_ids[1]]["cost_per_measure"]) + + # Do not get to EPC B: + # 5 are flats + # 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump - + # should be looked at but several surrounding properties have been surveyed in a similar fashion + # 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH. + # we could non-intrusively recommend a heat pump. 
# Exploratory flood-risk lookups against the environment.data.gov.uk flood-monitoring API.
import requests
from shapely.geometry import shape, Point


def get_flood_risk(lat, lon, radius_km=1, timeout=30):
    """
    Fetch the flood warnings returned by the `/id/floods` endpoint for a point and print a summary.

    :param lat: Latitude, sent as the API's `lat` query parameter.
    :param lon: Longitude, sent as the API's `long` query parameter.
    :param radius_km: Search radius, sent as the API's `dist` query parameter.
    :param timeout: Seconds to wait for the HTTP response before giving up.
    :return: The list stored under the response's "items" key (empty list when the key is absent).
    :raises requests.HTTPError: If the API answers with an error status.
    :raises requests.Timeout: If no response arrives within `timeout` seconds.
    """
    url = "https://environment.data.gov.uk/flood-monitoring/id/floods"
    params = {
        'lat': lat,
        'long': lon,
        'dist': radius_km  # search radius in km
    }

    # Without a timeout a single stalled connection blocks the caller indefinitely
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    flood_warnings = response.json().get("items", [])

    if not flood_warnings:
        print("No active flood warnings near this location.")
        return flood_warnings

    print(f"{len(flood_warnings)} warning(s) found near the location:")
    for warning in flood_warnings:
        print(f"- Area: {warning.get('description')}")
        print(f" Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})")
        print(f" Message changed at: {warning.get('timeMessageChanged')}")
        print()

    return flood_warnings


def get_flood_areas_near_point(lat, lon, radius_km=2, timeout=30):
    """
    Fetch the flood areas returned by the `/id/floodAreas` endpoint for a point.

    :param lat: Latitude, sent as the API's `lat` query parameter.
    :param lon: Longitude, sent as the API's `long` query parameter.
    :param radius_km: Search radius, sent as the API's `dist` query parameter.
    :param timeout: Seconds to wait for the HTTP response before giving up.
    :return: The list stored under the response's "items" key (empty list when the key is absent).
    :raises requests.HTTPError: If the API answers with an error status.
    :raises requests.Timeout: If no response arrives within `timeout` seconds.
    """
    url = "https://environment.data.gov.uk/flood-monitoring/id/floodAreas"
    params = {
        'lat': lat,
        'long': lon,
        'dist': radius_km
    }

    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json().get("items", [])


def point_in_flood_area(lat, lon, timeout=30):
    """
    Return the first nearby flood area whose polygon contains the point.

    Flood areas within 1 km are fetched, each area's polygon document is downloaded, and the
    geometry of its first feature is tested against the point. Areas without a polygon URL or
    without features are skipped.

    :param lat: Latitude of the point.
    :param lon: Longitude of the point.
    :param timeout: Seconds to wait for each HTTP response before giving up.
    :return: The matching flood-area record (a dict from the API), or None when no area contains the point.
    :raises requests.HTTPError: If any request answers with an error status.
    :raises requests.Timeout: If any request exceeds `timeout` seconds.
    """
    flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1, timeout=timeout)
    point = Point(lon, lat)  # GeoJSON uses (lon, lat) format

    for area in flood_areas:
        polygon_url = area.get("polygon")
        if not polygon_url:
            continue

        polygon_response = requests.get(polygon_url, timeout=timeout)
        polygon_response.raise_for_status()

        features = polygon_response.json().get("features", [])
        if not features:
            continue

        # Only the first feature of the polygon document is tested
        flood_polygon = shape(features[0]['geometry'])

        try:
            is_inside = flood_polygon.contains(point)
        except Exception:
            # A geometry that cannot be evaluated counts as "not inside" (best effort), but
            # KeyboardInterrupt/SystemExit are no longer swallowed as they were by a bare except
            is_inside = False

        if is_inside:
            print(f"📍 Point is inside flood area: {area['label']} ({area['notation']})")
            return area

    return None
"House_Cavity_Uninsulated_Pitched roof_Pre 1970", + "House_Cavity_Uninsulated_Pitched roof_Post 1970", + "other", + "House_System_Uninsulated_Pitched roof_Pre 1970", + "House_Solid_Uninsulated_Not Pitched Roof_Pre 1970" + ] + + values = [62, 36, 21, 16, 16, 4, 2] + + hovertext = [ + "Loft insulation, draft proofing", + "Top-up loft insulation", + "Cavity wall insulation, loft insulation", + "Cavity wall insulation, ventilation", + "Bespoke retrofit measures", + "External wall insulation, roof insulation", + "Flat roof insulation, internal wall insulation" + ] + + fig = go.Figure(go.Treemap( + labels=labels, + parents=[""] * len(labels), # No root + values=values, + hovertext=hovertext, + hoverinfo="text", + textinfo="none", + marker=dict( + line=dict(color="white", width=4), + colors=values, + colorscale="Blues" + ) + )) + + fig.update_layout( + margin=dict(t=10, l=10, r=10, b=10), + plot_bgcolor="white", + paper_bgcolor="white" + ) + + fig.show() + + # Get the recommended measures by scenario id + recommendation_cols = [c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c] + measure_counts_by_scenario = scenario_data[scenario_ids[1]].groupby("archetype_group")[ + recommendation_cols + ].sum().reset_index() + + measure_counts_by_scenario.to_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv" + ) + + # Estimate average valuation improvment by scenarios + valuation_data = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv" + ) + + from backend.ml_models.Valuation import PropertyValuation + + uplift = [] + for _, x in valuation_data.iterrows(): + uprn = x["uprn"] + + to_append = {"uprn": uprn} + for _id in scenario_ids: + scenario = scenario_data[_id][ + scenario_data[_id]["uprn"] == uprn + ].squeeze() + + val = PropertyValuation.estimate_valuation_improvement( + current_value=x["valuation"], + 
import pandas as pd

# Folder holding the cost study workbook and receiving every CSV written below
PILOT_DIR = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme"

MODEL_COLUMN = "Model"
INVOICED_COLUMN = "Total invoiced (including VAT)"


def _measure_costs(costs, label, suffix=""):
    """
    Extract the invoiced costs of one measure from the wide cost-study sheet.

    :param costs: The cost study sheet as loaded by `pd.read_excel`.
    :param label: Prefix put in front of every model name, e.g. "CWI".
    :param suffix: Suffix carried by this measure's repeated column headers
        ("" for the first pair, then ".2", ".3", ...).
    :return: A two-column frame ("Model", "Total invoiced (including VAT)") holding only the rows
        that have an invoiced total; the original row index is kept.
    """
    measure = costs[[MODEL_COLUMN + suffix, INVOICED_COLUMN + suffix]].copy()
    # Give every measure the same headers so the frames can be concatenated
    measure.columns = [MODEL_COLUMN, INVOICED_COLUMN]
    measure[MODEL_COLUMN] = f"{label} - " + measure[MODEL_COLUMN]
    return measure[~pd.isnull(measure[INVOICED_COLUMN])]


# Get the wave 2 costing data and produce some breakdowns
costs = pd.read_excel(
    f"{PILOT_DIR}/Measure cost study for MOD.xlsx",
    header=2
)

# One frame per measure; the sheet repeats the same column pair with a numeric suffix
cwi_costs = _measure_costs(costs, "CWI")  # Cavity wall insulation
li_costs = _measure_costs(costs, "LI", ".2")  # Loft insulation
windows_costs = _measure_costs(costs, "Windows", ".3")
doors_costs = _measure_costs(costs, "Doors", ".4")
ashps_costs = _measure_costs(costs, "ASHP", ".5")
solar_costs = _measure_costs(costs, "Solar", ".6")

fabric_costing_data = pd.concat([cwi_costs, li_costs])
windows_doors_costing_data = pd.concat([windows_costs, doors_costs])

windows_doors_costing_data.to_csv(f"{PILOT_DIR}/windows_doors_costs.csv")
fabric_costing_data.to_csv(f"{PILOT_DIR}/fabric_costing_data.csv")
ashps_costs.to_csv(f"{PILOT_DIR}/ashps_costs.csv")
solar_costs.to_csv(f"{PILOT_DIR}/solar_costs.csv")

# Mean total cost of works per property age (the source header carries a trailing space)
project_cost_by_age = costs[["Property age ", "TOTAL Cost of Works"]].groupby("Property age ").mean().reset_index()
3AW"}, + {"address": "332 Mill Road", "postcode": "CB1 3NN"}, + {"address": "330 Mill Road", "postcode": "CB1 3NN"}, + {"address": "328 Mill Road", "postcode": "CB1 3NN"}, + {"address": "71 Mill Road", "postcode": "CB1 2AS"}, + {"address": "78 Argyle Street", "postcode": "CB1 3LZ"}, + {"address": "9 Graham Road", "postcode": "CB4 2ZE"}, + {"address": "217 Mill Road", "postcode": "CB1 3BE"}, + {"address": "374 Mill Road", "postcode": "CB1 3NN"}, + {"address": "174 Thoday Street", "postcode": "CB1 3AX"}, + {"address": "37 Abbey Road", "postcode": "CB5 8HH"}, + {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"}, + {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"}, + {"address": "108 Argyle Street", "postcode": "CB1 3LS"}, + {"address": "115 Victoria Road", "postcode": "CB4 3BS"}, + {"address": "55 Ross Street", "postcode": "CB1 3BP"}, + {"address": "16 Kingston Street", "postcode": "CB1 2NU"}, + {"address": "13 Thoday Street", "postcode": "CB1 3AS"}, + {"address": "103 York Street", "postcode": "CB1 2PZ"}, +] + +asset_list = pd.DataFrame(addresses) +asset_list["row_id"] = asset_list.index + +epc_data, _, _ = get_data( + asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address", + manual_uprn_map={}, epc_api_only=True +) + +epc_df = pd.DataFrame(epc_data) +epc_df.shape + +asset_list = asset_list.merge( + epc_df, how="left", on="row_id" +) + +asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"}) +asset_list["uprn"] = asset_list["uprn"].astype(str) + +spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], bucket_name="retrofit-data-dev") +spatial_data["UPRN"] = spatial_data["UPRN"].astype(str) + +asset_list = asset_list.merge( + spatial_data, how="left", left_on="uprn", right_on="UPRN" +) + +asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv", + index=False) diff --git 
a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py index a0d01f7d..a8805a71 100644 --- a/etl/customers/remote_assessments/app.py +++ b/etl/customers/remote_assessments/app.py @@ -1,9 +1,15 @@ +import os import pandas as pd +from dotenv import load_dotenv from utils.s3 import save_csv_to_s3 +from etl.find_my_epc.AssetListEpcData import AssetListEpcData -PORTFOLIO_ID = 111 +PORTFOLIO_ID = 141 USER_ID = 8 +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + def app(): """ @@ -13,10 +19,21 @@ def app(): asset_list = [ { - "uprn": 100050770761, - "address": "12 Sheardown Street", - "postcode": "DN4 0BH" - } + "address": "196 Merrow Street", + "postcode": "SE17 2NP", + "uprn": 200003423454, + "patch": True + }, + { + "address": "65 Liverpool Grove", + "postcode": "SE17 2HP", + "uprn": 200003423194 + }, + { + "address": "2 Brettell Street", + "postcode": "SE17 2NZ", + "uprn": 200003423607 + }, ] asset_list = pd.DataFrame(asset_list) @@ -28,30 +45,46 @@ def app(): file_name=filename ) - non_invasive_recommendations = [ - { - "uprn": 100050770761, - "recommendations": [ - { - "type": "extension_cavity_wall_insulation", - "sap_points": 2, - } - ] - } - ] + # Pull the non-invasive recommendations automatically + asset_list_epc_client = AssetListEpcData( + asset_list=asset_list, + epc_auth_token=EPC_AUTH_TOKEN + ) + asset_list_epc_client.get_data() + asset_list_epc_client.get_non_invasive_recommendations() + asset_list_epc_client.get_patch() + # Store non-invasive recommendations in S3 non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" save_csv_to_s3( - dataframe=pd.DataFrame(non_invasive_recommendations), + dataframe=pd.DataFrame(asset_list_epc_client.non_invasive_recommendations), bucket_name="retrofit-plan-inputs-dev", file_name=non_invasive_recommendations_filename ) + # Store patches in S3 + patches_filename = "" + if asset_list_epc_client.patches: + 
patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(asset_list_epc_client.patches), + bucket_name="retrofit-plan-inputs-dev", + file_name=patches_filename + ) + valuation_data = [ { - "uprn": 100050770761, - "value": 67_000 - } + "valuation": 339_000, + "uprn": 200003423454, + }, + { + "valuation": 374_000, + "uprn": 200003423194 + }, + { + "valuation": 719_000, + "uprn": 200003423607 + }, ] # Store valuation data to s3 valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" @@ -68,7 +101,7 @@ def app(): "goal_value": "C", "trigger_file_path": filename, "already_installed_file_path": "", - "patches_file_path": "", + "patches_file_path": patches_filename, "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, "valuation_file_path": valuation_filename, "scenario_name": "Full package remote assessment", diff --git a/etl/customers/settle/route_march_2024_11_08.py b/etl/customers/settle/route_march_2024_11_08.py new file mode 100644 index 00000000..21b6f2df --- /dev/null +++ b/etl/customers/settle/route_march_2024_11_08.py @@ -0,0 +1,226 @@ +import os +import time + +import pandas as pd +from tqdm import tqdm + +from dotenv import load_dotenv +from utils.s3 import read_excel_from_s3 +from backend.SearchEpc import SearchEpc +from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes + +from recommendations.recommendation_utils import ( + estimate_perimeter, + estimate_external_wall_area, + estimate_number_of_floors +) + +load_dotenv(dotenv_path="backend/.env") +EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN") + + +def get_data(asset_list): + epc_data = [] + errors = [] + for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)): + try: + postcode = home["Postcode"] + house_number = home["AddressLine1"] + full_address = ", ".join([home["AddressLine1"], home["AddressLine4"], home["AddressLine5"]]) + + searcher = SearchEpc( + address1=str(house_number), + postcode=postcode, + 
def _recommendation_flags(recommendations_df):
    """
    Pivot each property's EPC recommendations into one boolean column per recommendation text.

    :param recommendations_df: Frame with a "row_id" column and a "recommendations" column holding
        a list of dicts, each with an "improvement-summary-text" key.
    :return: Frame with "row_id" plus one True/False column per distinct summary text, in order of
        first appearance. A column for the empty summary text "" is removed when present.
    """
    summary_key = "improvement-summary-text"

    # dict.fromkeys de-duplicates while keeping first-seen order, so the output column order is
    # reproducible between runs (iterating a set is not)
    summaries = list(dict.fromkeys(
        rec[summary_key]
        for recs in recommendations_df["recommendations"]
        for rec in recs
    ))

    rows = []
    for row_id, recs in zip(recommendations_df["row_id"], recommendations_df["recommendations"]):
        present = {rec[summary_key] for rec in recs}
        rows.append({"row_id": row_id, **{summary: summary in present for summary in summaries}})

    flags = pd.DataFrame(rows, columns=["row_id", *summaries])
    # The "" column only exists when some recommendation had no summary text; without
    # errors="ignore" the drop raises KeyError on data sets that have none
    return flags.drop(columns=[""], errors="ignore")


def app():
    """
    Pull EPC data for the Settle proposed programme asset list and write it to an Excel workbook.

    Data request contents:
    Date of last EPC
    Reason for EPC
    SAP score on register
    Property Type
    Property Area
    Property Age
    Any Dimensions (HLP,PW,RH)
    Property Wall Construction
    Heating Type
    Secondary Heating
    Loft Insulation Depth

    Additional if possible:
    Heat loss calculations
    EPC recommendations
    Property UPRN

    Rows whose lookup raised are retried once; rows that fail twice are reported on stdout and
    appear in the output without EPC data.
    """
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/SETTLE FULL PROPOSED PROGRAMME.xlsx",
        header=0
    )
    # row_id ties every EPC record back to its row in the asset list
    asset_list["row_id"] = asset_list.index

    epc_data, errors = get_data(asset_list)

    # We now retrieve any failed properties
    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
    epc_data_failed, still_failed = get_data(asset_list_failed)

    # Append the failed data to the main data
    epc_data.extend(epc_data_failed)
    if still_failed:
        print(f"{len(still_failed)} properties failed on both attempts (row_id): {still_failed}")

    epc_df = pd.DataFrame(epc_data)

    # We expand out the recommendations
    transformed_df = _recommendation_flags(epc_df[["row_id", "recommendations"]])

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "row_id",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
        ]
    ]

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        transformed_df,
        how="left",
        on="row_id"
    )

    asset_list = asset_list.drop(columns=["row_id"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "Property Type",
        "built-form": "Archetype",
        "total-floor-area": "Property Floor Area",
        "construction-age-band": "Property Age Band",
        "floor-height": "Property Floor Height",
        "number-habitable-rooms": "Number of Habitable Rooms",
        "walls-description": "Wall Construction",
        "roof-description": "Roof Construction",
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
        "energy-consumption-current": "Heat Demand (kWh/m2)"
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
    # Blank room counts become NaN. An explicit NaN is used because `.replace("", None)` is
    # version-dependent in pandas and forward-fills from the previous row on older releases
    asset_list["Number of Habitable Rooms"] = (
        asset_list["Number of Habitable Rooms"].replace("", float("nan")).astype(float)
    )

    # Floor area and room count are split evenly across the estimated number of floors
    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            # Falls back to 2.5 when the floor height is blank
            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # Store as an excel
    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
    asset_list.to_excel(filename, index=False)
def get_data(asset_list):
    """
    Look up the newest EPC record (plus its recommendations) for every row of ``asset_list``.

    :param asset_list: DataFrame with one property per row; the columns "row_id",
        "Postcode", "address1" and "Address" are read.
    :return: tuple ``(epc_data, errors)``. ``epc_data`` is a list of dicts holding the
        row's "row_id", the fields of the newest EPC and a "recommendations" list.
        ``errors`` is the list of "row_id" values whose lookup raised.
        Rows for which no EPC is found appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            postcode = home["Postcode"]
            address1 = home["address1"].split(",")[0]
            full_address = home["Address"]

            searcher = SearchEpc(
                address1=str(address1),
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                continue

            # Look for EPC recommendations; a failure here is not fatal, the
            # property is simply stored without any recommendations.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                # Was a bare ``except:``, which also swallowed KeyboardInterrupt
                # and SystemExit and made the loop impossible to interrupt here.
                property_recommendations = {"rows": []}

            epc = {
                "row_id": home["row_id"],
                **searcher.newest_epc.copy(),
                "recommendations": property_recommendations["rows"]
            }

            epc_data.append(epc)
        except Exception:
            # Remember the row so the caller can retry it, then pause before
            # moving on to the next property.
            errors.append(home["row_id"])
            time.sleep(5)

    return epc_data, errors
def app():
    """
    Pull EPC data for the properties in the Southend "Planned RM" sheet and write
    the enriched asset list to an Excel workbook.

    Data request contents:
        Date of last EPC
        Reason for EPC
        SAP score on register
        Property Type
        Property Area
        Property Age
        Any Dimensions (HLP,PW,RH)
        Property Wall Construction
        Heating Type
        Secondary Heating
        Loft Insulation Depth

    Additional if possible:
        Heat loss calculations
        EPC recommendations
        Property UPRN

    Takes no arguments and returns nothing; the input and output paths are hard-coded.
    """
    asset_list = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/Southend Planned programme.xlsx",
        header=0,
        sheet_name="Planned RM"
    )
    asset_list["row_id"] = asset_list.index
    asset_list["address1"] = asset_list["Address"].str.split(",").str[0]

    epc_data, errors = get_data(asset_list)

    # We now retrieve any failed properties (a single retry pass)
    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
    epc_data_failed, _ = get_data(asset_list_failed)

    # Append the failed data to the main data
    epc_data.extend(epc_data_failed)

    epc_df = pd.DataFrame(epc_data)

    # We expand out the recommendations into one boolean column per recommendation text
    recommendations_df = epc_df[["row_id", "recommendations"]]

    unique_recommendations = set()
    for _, row in recommendations_df.iterrows():
        unique_recommendations.update(rec["improvement-summary-text"] for rec in row["recommendations"])

    columns = ["row_id"] + list(unique_recommendations)
    transformed_data = []
    for _, row in recommendations_df.iterrows():
        # Initialize a dictionary for this row with False for all recommendations
        row_data = {col: False for col in columns}
        row_data["row_id"] = row["row_id"]

        # Set True for each recommendation present in this row
        for rec in row["recommendations"]:
            row_data[rec["improvement-summary-text"]] = True

        transformed_data.append(row_data)

    transformed_df = pd.DataFrame(transformed_data)
    # Drop the column that is "". It only exists when some recommendation had an
    # empty summary text, so a missing column must not abort the whole export.
    transformed_df = transformed_df.drop(columns=[""], errors="ignore")

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "row_id",
            "uprn",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
            "photo-supply",
        ]
    ]

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        transformed_df,
        how="left",
        on="row_id"
    )

    asset_list = asset_list.drop(columns=["row_id"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "Property Type",
        "built-form": "Archetype",
        "total-floor-area": "Property Floor Area",
        "construction-age-band": "Property Age Band",
        "floor-height": "Property Floor Height",
        "number-habitable-rooms": "Number of Habitable Rooms",
        "walls-description": "Wall Construction",
        "roof-description": "Roof Construction",
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
        "energy-consumption-current": "Heat Demand (kWh/m2)",
        "photo-supply": "% of the Roof with PV"
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
    # Replace "" value with None
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)

    # Per-floor area and room count are fed to the perimeter estimate
    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            # Falls back to 2.5 when the EPC carries no floor height
            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # Store as an excel
    filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
                "2024.xlsx")
    asset_list.to_excel(filename, index=False)
) + + asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply( + lambda x: estimate_external_wall_area( + num_floors=x["Estimated Number of Floors"], + floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5, + perimeter=x["Estimated Perimeter (m)"], + built_form=x["Archetype"] + ), + axis=1 + ) + + asset_list["Roof Insulation Thickness"] = asset_list.apply( + lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull( + x["Roof Construction"]) else None, + axis=1 + ) + + # Store as an excel + filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov " + "2024.xlsx") + asset_list.to_excel(filename, index=False) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py new file mode 100644 index 00000000..95fe4fcd --- /dev/null +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -0,0 +1,4293 @@ +import os +from urllib import parse +from fuzzywuzzy import fuzz + +import PyPDF2 +import re +import pandas as pd +import numpy as np +from tqdm import tqdm +from collections import Counter +from scipy.optimize import linprog + +from SearchEpc import SearchEpc +from utils.s3 import read_pickle_from_s3 + +CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" +SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}") +NUM_FOLDERS = 15 + + +def sap_to_epc(sap_points: int | float): + """ + Simple utility function to convert SAP points to EPC rating. 
def extract_wall_details_summary(text):
    """
    Parse the "7.0 Walls" section of a summary report into one record per building part.

    Each record carries the main wall's type, insulation, dry-lining, "thickness
    unknown" flag and thickness in mm, plus the same fields for an alternative wall.
    The alternative fields are filled from the first alternative-wall entry found
    after the building part; otherwise they stay None ("N/A" for dry-lining).

    :param text: full text of the summary PDF
    :return: list of dicts, in document order
    """
    # Everything between the "7.0 Walls:" heading and the "8.0 Roofs:" heading
    walls_block = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1)

    part_regex = re.compile(
        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # building part label
        r"Type\s+(.*?)\n"  # main wall Type
        r"Insulation\s+(.*?)\n"  # main wall Insulation
        r"(Dry-lining\s+(.*?)\n)?"  # optional main wall Dry-lining
        r"Wall Thickness Unknown\s+(.*?)\n"  # main wall Thickness Unknown
        r"Wall Thickness \[mm\]\s+(\d+)",  # main wall Thickness
        re.DOTALL
    )
    alternative_regex = re.compile(
        r"Alternative Wall Area.*?\n"  # start of alternative wall section
        r"Alternative Type\s+(.*?)\n"  # alternative wall Type
        r"Alternative Insulation\s+(.*?)\n"  # alternative wall Insulation
        r"(Alternative Dry-lining\s+(.*?)\n)?"  # optional Alternative Dry-lining
        r"Alternative Wall Thickness Unknown\s+(.*?)\n"  # alternative wall Thickness Unknown
        r"Alternative Wall Thickness\s+(\d+)",  # alternative wall Thickness
        re.DOTALL
    )

    def _stripped_or_na(value):
        # Dry-lining rows are optional in the report
        return value.strip() if value else "N/A"

    records = []
    for part in part_regex.finditer(walls_block):
        # Group 4 is the whole optional dry-lining row; only its value (group 5) is kept
        label, wall_type, insulation, _, dry_lining, thickness_unknown, thickness = part.groups()
        record = {
            "Building Part": label.strip(),
            "Wall Type": wall_type.strip(),
            "Wall Insulation": insulation.strip(),
            "Wall Dry-lining": _stripped_or_na(dry_lining),
            "Wall Thickness Unknown": thickness_unknown.strip(),
            "Wall Thickness (mm)": int(thickness),
            "Alternative Wall Type": None,
            "Alternative Wall Insulation": None,
            "Alternative Wall Dry-lining": "N/A",
            "Alternative Wall Thickness Unknown": None,
            "Alternative Wall Thickness (mm)": None,
        }

        # Look for an alternative wall entry anywhere after this building part
        alternative = alternative_regex.search(walls_block, part.end())
        if alternative:
            alt_type, alt_insulation, _, alt_dry_lining, alt_unknown, alt_thickness = alternative.groups()
            record.update({
                "Alternative Wall Type": alt_type.strip(),
                "Alternative Wall Insulation": alt_insulation.strip(),
                "Alternative Wall Dry-lining": _stripped_or_na(alt_dry_lining),
                "Alternative Wall Thickness Unknown": alt_unknown.strip(),
                "Alternative Wall Thickness (mm)": int(alt_thickness),
            })

        records.append(record)

    return records
def extract_summary_report(pdf_path):
    """
    Extracts specific data from the provided summary report PDF file.

    Data includes the address, current SAP rating, fuel bill, age band, storeys,
    window and door counts, primary/secondary/water heating details, dimensions,
    lighting counts and the main building's roof and wall construction.

    :param pdf_path: path of the summary report PDF to read
    :return: dict keyed by the field names initialised below. "Current EPC Band"
        is never filled in here and stays None, as does "Postcode" when the
        report has no postcode line.
    :raises AttributeError: if a mandatory field's pattern is not found in the text
    :raises IndexError: if no roof or wall entry labelled "Main" is found
    :raises ValueError: if the window data or the dimensions section cannot be extracted
    """

    data = {
        "Address": None,
        "Postcode": None,
        "Current SAP Rating": None,
        "Current EPC Band": None,
        "Fuel Bill": None,
        "Main Building Age Band": None,
        "Number of Storeys": None,
        "Window Age Description": None,
        "Window Age Description Proportion (%)": None,
        "Secondary Window Age Description": None,
        "Secondary Window Age Description Proportion (%)": None,
        "Number of Windows": None,
        "Total Number of Doors": None,
        "Number of Insulated Doors": None,
        "Existing Primary Heating System": None,
        "Existing Primary Heating PCDF Reference": None,
        "Existing Primary Heating Controls": None,
        "Existing Primary Heating % of Heat": None,
        "Existing Secondary Heating System": None,
        "Existing Secondary Heating PCDF Reference": None,
        "Existing Secondary Heating Controls": None,
        "Existing Secondary Heating % of Heat": None,
        "Secondary Heating Code": None,
        "Water Heating Code": None,
        'Total Floor Area (m2)': None,
        'Total Ground Floor Area (m2)': None,
        'RIR Floor Area': None,
        'Main Building Wall Area (m2)': None,
        'First Extension Wall Area (m2)': None,
        "Number of Light Fittings": None,
        "Number of LEL Fittings": None,
        "Number of fittings needing LEL": None,
        "Main Roof Type": None,
        "Main Roof Insulation": None,
        "Main Roof Insulation Thickness": None,
        "Main Wall Type": None,
        "Main Wall Insulation": None,
        "Main Wall Dry-lining": None,
        "Main Wall Thickness": None,
        "Main Building Alternative Wall Type": None,
        "Main Building Alternative Wall Insulation": None,
        "Main Building Alternative Wall Dry-lining": None,
        "Main Building Alternative Wall Thickness": None,
    }

    # Read all pages first so the file handle is released before any parsing runs
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = "".join(page.extract_text() for page in reader.pages)

    # Extract Current SAP rating: the match is "<band letter> <score>", keep the score
    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
    data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]

    # Extract age
    age_band_match = re.search(
        r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
        text
    )
    data["Main Building Age Band"] = age_band_match.group(1)

    # Number of storeys
    storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
    data["Number of Storeys"] = int(storeys_match.group(1))

    # Extract Carbon Emissions
    # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
    # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))

    # Extract Fuel Bill
    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
    data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    # Extract individual address components
    postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
    # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
    house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
    house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
    street = re.search(r"Street:\s*(.*?)\nLocality:", text)
    locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
    town = re.search(r"Town:\s*(.*?)\nCounty:", text)
    county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)

    # Clean extracted values; a missing component contributes an empty string
    address_parts = [
        component.group(1).strip() if component else ""
        for component in (house_no, house_name, street, locality, town, county, postcode)
    ]

    # Join non-empty parts with a comma
    data["Address"] = ", ".join([part for part in address_parts if part])
    # Guarded like the address parts above: a report without a postcode line
    # keeps the default None instead of raising AttributeError.
    if postcode:
        data["Postcode"] = postcode.group(1).strip()

    windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
    windows_text = windows_section.group(1)
    window_data = extract_window_age_description(windows_text)
    data.update(window_data)

    # Extract Total Number of Doors
    total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text)
    data["Total Number of Doors"] = int(total_doors_match.group(1))

    # Extract Number of Insulated Doors
    insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text)
    data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

    # Extract Primary Heating Section: it ends at "Main Heating2" when a second
    # main heating system is listed, otherwise at "Water Heating"
    primary_heating_section = (
        re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
        or re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
    )

    primary_text = primary_heating_section.group(1)

    data["Existing Primary Heating System"] = re.search(
        r"Main Heating Code\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating PCDF Reference"] = re.search(
        r"PCDF boiler Reference\s*(\d+)", primary_text
    ).group(1)
    data["Existing Primary Heating Controls"] = re.search(
        r"Main Heating Controls\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating % of Heat"] = int(
        re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
    )

    # Extract Secondary Heating Section
    secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)

    if secondary_heating_section is None:
        data["Existing Secondary Heating System"] = ""
        data["Existing Secondary Heating PCDF Reference"] = ""
        data["Existing Secondary Heating Controls"] = ""
        data["Existing Secondary Heating % of Heat"] = 0

    else:
        secondary_text = secondary_heating_section.group(1)

        main_heating_code_match_secondary = re.search(
            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
        )
        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
        data["Existing Secondary Heating PCDF Reference"] = re.search(
            r"PCDF boiler Reference\s*(\d+)", secondary_text
        ).group(1)
        second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
        data["Existing Secondary Heating Controls"] = (
            second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
        )
        data["Existing Secondary Heating % of Heat"] = int(
            re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
        )

    # Extract Secondary Heating and Water Heating Codes
    secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
    water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)

    # The secondary heating code is only reported when a second system exists
    if data["Existing Secondary Heating System"] == "":
        data["Secondary Heating Code"] = ""
    else:
        data["Secondary Heating Code"] = (
            secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else ""
        )

    data["Water Heating Code"] = water_heating_code_match.group(1).strip()

    dimensions = extract_building_parts_summary(text)
    data.update(dimensions)

    data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
    data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
    data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

    extracted_roof_data = extract_roof_details_summary(text)
    main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]
    data["Main Roof Type"] = main_roof_data["Roof Type"]
    data["Main Roof Insulation"] = main_roof_data["Roof Insulation"]
    data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"]

    walls_data = extract_wall_details_summary(text)
    # Get the main building wall data
    main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0]
    data["Main Wall Type"] = main_building_walls["Wall Type"]
    data["Main Wall Insulation"] = main_building_walls["Wall Insulation"]
    data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"]
    data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"]
    data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"]
    data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"]
    data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"]
    data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"]

    return data
def extract_window_age_description(windows_text):
    """
    Extracts the most common window age description and its proportion.

    Parameters:
        windows_text (str): The text section containing window data.

    Returns:
        dict: The most common and second most common window age descriptions, the
        percentage of all counted windows each accounts for, and the total number
        of windows counted. When every window shares one description, the
        secondary description is None and its proportion is 0.

    Raises:
        ValueError: If none of the known descriptions occurs in ``windows_text``.
    """
    # Clean up windows_text by removing line breaks for better pattern matching
    windows_text = windows_text.replace("\n", "")

    # Define possible window age descriptions
    window_descriptions = [
        "Double post or during 2002",
        "Double pre 2002",
        "Double with unknown install date",
        "Secondary glazing",
        "Triple glazing",
        "Single glazing",
    ]

    # Count (non-overlapping) occurrences of each description
    description_counts = Counter(
        {description: windows_text.count(description) for description in window_descriptions}
    )

    # The counter always holds one key per description, so only the total can
    # tell whether anything was actually found.
    total_windows = sum(description_counts.values())
    if not total_windows:
        raise ValueError("Failed to extract window data.")

    # Ties keep the order of window_descriptions
    (most_common_description, window_count), (runner_up, runner_up_count) = description_counts.most_common(2)
    window_proportion = window_count / total_windows * 100

    if window_proportion == 100:
        second_most_common_description = None
        second_most_common_proportion = 0
    else:
        second_most_common_description = runner_up
        second_most_common_proportion = runner_up_count / total_windows * 100

    return {
        "Window Age Description": most_common_description,
        "Window Age Description Proportion (%)": window_proportion,
        "Secondary Window Age Description": second_most_common_description,
        "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
        "Number of Windows": total_windows
    }
def extract_building_parts_epr(text):
    """
    Aggregate floor and wall areas from the "Construction details" blocks of an EPR report.

    Every building part (main building and extensions) lists floor area, room height,
    perimeter and party wall length per storey. A part whose header mentions
    "Room(s) in Roof area" contributes only that floor area.

    :param text: full text of the EPR PDF
    :return: dict with the total, ground-floor and room-in-roof floor areas and the
        wall area (perimeter x room height, summed over storeys) of the main
        building and of the 1st extension
    """
    part_regex = re.compile(
        r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
        r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
        re.DOTALL
    )
    # One storey row: label followed by area, height, perimeter, party wall length
    storey_regex = re.compile(
        r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )

    rows = []
    for part in part_regex.finditer(text):
        header = part.group(1).strip()
        measurements = part.group(2)

        roof_room = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", header)
        if roof_room:
            # Only a floor area is available for a room in roof
            rows.append({
                "Building Part": re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", header).strip(),
                "Floor Level": "Room in Roof",
                "Floor Area (m2)": float(roof_room.group(1)),
                "Room Height (m)": None,
                "Perimeter (m)": None,
                "Party Wall Length (m)": None
            })
            continue

        # Keep only the descriptor (e.g. "Main" or "1st Extension")
        label = re.sub(r" - built in.*", "", header).strip()
        for storey in storey_regex.finditer(measurements):
            level, *figures = storey.groups()
            area, height, perimeter, party_wall = (float(figure) for figure in figures)
            rows.append({
                "Building Part": label,
                "Floor Level": level,
                "Floor Area (m2)": area,
                "Room Height (m)": height,
                "Perimeter (m)": perimeter,
                "Party Wall Length (m)": party_wall
            })

    def _floor_area(level_marker):
        # Floor area summed over the rows whose level contains the marker
        return sum(row["Floor Area (m2)"] for row in rows if level_marker in row["Floor Level"])

    def _wall_area(part_marker):
        # Rows without a perimeter or a room height (e.g. room in roof) are skipped
        return sum(
            row["Perimeter (m)"] * row["Room Height (m)"]
            for row in rows
            if part_marker in row["Building Part"] and row["Perimeter (m)"] and row["Room Height (m)"]
        )

    return {
        "Total Floor Area (m2)": sum(row["Floor Area (m2)"] for row in rows),
        "Total Ground Floor Area (m2)": _floor_area("Lowest floor"),
        "RIR Floor Area": _floor_area("Room in Roof"),
        "Main Building Wall Area (m2)": _wall_area("Main"),
        "First Extension Wall Area (m2)": _wall_area("1st Extension"),
    }
def extract_building_parts_summary(text):
    """
    Aggregate floor and wall areas from the Dimensions section of a summary report.

    Handles the Main Property, any numbered extensions and "Room(s) in Roof"
    entries (which only carry a floor area).

    :param text: full text of the summary report PDF
    :return: dict with the total, ground-floor and room-in-roof floor areas and the
        wall area (perimeter x room height, summed over storeys) of the main
        property and of the 1st extension
    :raises ValueError: if neither an "Internal" nor an "External" dimensions
        section can be located
    """
    # The section is headed by the dimension type; Internal is tried first
    section = None
    for section_pattern in (
        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:",
        r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:",
    ):
        section = re.search(section_pattern, text, re.DOTALL)
        if section:
            break
    if section is None:
        raise ValueError("Failed to locate dimensions section in the text.")

    # A building part runs from its label up to the next extension label
    part_regex = re.compile(
        r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
        re.DOTALL
    )
    # One storey row: label, then area, height, perimeter, party wall length
    storey_regex = re.compile(
        r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )
    roof_room_regex = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")

    rows = []
    for part in part_regex.finditer(section.group(1)):
        label, measurements = part.group(1), part.group(2)

        for storey in storey_regex.finditer(measurements):
            level, *figures = storey.groups()
            area, height, perimeter, party_wall = (float(figure) for figure in figures)
            rows.append({
                "Building Part": label,
                "Floor Level": level,
                "Floor Area (m2)": area,
                "Room Height (m)": height,
                "Perimeter (m)": perimeter,
                "Party Wall Length (m)": party_wall
            })

        # "Room(s) in Roof" entries only have a floor area
        roof_room = roof_room_regex.search(measurements)
        if roof_room:
            rows.append({
                "Building Part": label,
                "Floor Level": "Room in Roof",
                "Floor Area (m2)": float(roof_room.group(1)),
                "Room Height (m)": None,
                "Perimeter (m)": None,
                "Party Wall Length (m)": None
            })

    def _floor_area(level_marker):
        # Floor area summed over the rows whose level contains the marker
        return sum(row["Floor Area (m2)"] for row in rows if level_marker in row["Floor Level"])

    def _wall_area(part_marker):
        # Rows without a perimeter or a room height (e.g. room in roof) are skipped
        return sum(
            row["Perimeter (m)"] * row["Room Height (m)"]
            for row in rows
            if part_marker in row["Building Part"] and row["Perimeter (m)"] and row["Room Height (m)"]
        )

    return {
        "Total Floor Area (m2)": sum(row["Floor Area (m2)"] for row in rows),
        "Total Ground Floor Area (m2)": _floor_area("Lowest Floor"),
        "RIR Floor Area": _floor_area("Room in Roof"),
        "Main Building Wall Area (m2)": _wall_area("Main Property"),
        "First Extension Wall Area (m2)": _wall_area("1st Extension"),
    }
def extract_roof_details_epr(text):
    """
    Collect roof type, insulation and insulation thickness for each building part
    listed under "Construction details" in an EPR report.

    :param text: full text of the EPR PDF
    :return: list of dicts, one per building part in document order; a field that
        is not present for a part is None
    """
    part_regex = re.compile(
        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
        re.DOTALL
    )

    def _value(pattern, details):
        # Stripped first capture group of ``pattern``, or None when it is absent
        found = re.search(pattern, details)
        return found.group(1).strip() if found else None

    roofs = []
    for part in part_regex.finditer(text):
        details = part.group(2)
        roofs.append({
            # Drop the "- built in ..." suffix and any room-in-roof note from the label
            "Building Part": re.sub(
                r" - built in.*|Room\(s\) in Roof area:.*", "", part.group(1).strip()
            ).strip(),
            "Roof Type": _value(r"Roof Type:\s*(.*?)(?=\n|$)", details),
            "Roof Insulation": _value(r"Roof Insulation:\s*(.*?)(?=\n|$)", details),
            "Roof Insulation Thickness": _value(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", details),
        })

    return roofs
def extract_roof_details_summary(text):
    """
    Collect roof type, insulation and insulation thickness for each building part
    in the "8.0 Roofs" section of a summary report.

    :param text: full text of the summary report PDF
    :return: list of dicts, one per building part matched; empty when the text has
        no "8.0 Roofs" section. Insulation fields missing for a part are None.
    """
    section = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
    if not section:
        return []  # no roof section in this report

    # Re-append the next heading so the lookaheads below always have a boundary
    roofs_block = section.group(1).strip() + "\n9.0 Floors:"

    part_regex = re.compile(
        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # building part label
        r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))"  # Roof Type until the next field, label, or end
        r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?"  # optional Insulation
        r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?",  # optional Insulation Thickness
        re.DOTALL
    )

    roofs = []
    for part in part_regex.finditer(roofs_block):
        label, roof_type, insulation, thickness = part.groups()
        roof_type = roof_type.strip()

        # The type capture can run into the following label, e.g.
        # 'A Another dwelling above\n1st Extension'; keep only the type itself.
        if roof_type.startswith("A Another dwelling above"):
            roof_type = "A Another dwelling above"

        roofs.append({
            "Building Part": label.strip(),
            "Roof Type": roof_type,
            "Roof Insulation": insulation.strip() if insulation else None,
            "Roof Insulation Thickness": thickness.strip() if thickness else None,
        })

    return roofs
+ """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" # Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append({ + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + }) + + return roof_data + + +def extract_wall_details_epr(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part + in the provided EPR PDF text. 
def extract_wall_details_epr(text):
    """
    Extracts wall type, insulation, dry-lining, and thickness for each building part
    in the provided EPR PDF text.

    :param text: Full text of the EPR
    :return: List of dictionaries, one per "Construction details: Building part:" section, with the keys
        "Building Part", "Wall Type", "Wall Insulation", "Wall Dry-lining", "Wall Thickness" and the same
        four fields prefixed with "Alternative ". Text fields are stripped strings, thickness fields are
        ints, and any field that is not present in the section is None.
    """
    # Each section runs from its heading to the next heading, a "Conservatory" block, or the end of the text
    building_part_pattern = re.compile(
        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
        re.DOTALL
    )

    def _find(details, label, value_pattern, alternative):
        # "Wall Type:" is a suffix of "Alternative Wall Type:", so the primary lookup must refuse to match
        # inside an alternative-wall line; otherwise a part listing only alternative-wall fields would
        # have them reported as its primary wall.
        prefix = "Alternative " if alternative else r"(?<!Alternative )"
        return re.search(prefix + label + r":\s*" + value_pattern + r"(?=\n|$)", details)

    def _text_field(details, label, alternative):
        found = _find(details, label, r"(.*?)", alternative)
        return found.group(1).strip() if found else None

    def _int_field(details, label, alternative):
        found = _find(details, label, r"(\d+)", alternative)
        return int(found.group(1)) if found else None

    wall_data = []
    for match in building_part_pattern.finditer(text):
        part_name = match.group(1).strip()

        # Clean up the building part name (drop the age band / room-in-roof suffix)
        cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()

        part_details = match.group(2)

        entry = {"Building Part": cleaned_part_name}
        # Primary wall first, then the alternative wall, keeping the original key order
        for key_prefix, alternative in (("", False), ("Alternative ", True)):
            entry[key_prefix + "Wall Type"] = _text_field(part_details, "Wall Type", alternative)
            entry[key_prefix + "Wall Insulation"] = _text_field(part_details, "Wall Insulation", alternative)
            entry[key_prefix + "Wall Dry-lining"] = _text_field(part_details, "Wall Dry-lining", alternative)
            entry[key_prefix + "Wall Thickness"] = _int_field(part_details, "Wall Thickness", alternative)

        wall_data.append(entry)

    return wall_data
def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.

    Both the current report layout and the older Elmhurst layout are handled for the SAP rating and
    the primary energy figures.

    :param pdf_path: String path to the EPR PDF file
    :return: Dictionary of extracted values. Every key listed below is always present; values that this
        function does not populate are left as None unless the window / building-part helpers fill them.
    :raises AttributeError: if the pattern for a mandatory field is not found in the text (the match is None)
    :raises IndexError: if no building part whose name contains "Main" is found
    """
    data = dict.fromkeys((
        "Address",
        "Postcode",
        "Current SAP Rating",
        "Current EPC Band",
        "Primary Energy Use (kWh/yr)",
        "Primary Energy Use Intensity (kWh/m2/yr)",
        "Number of Storeys",
        "Main Building Age Band",
        "Fuel Bill",
        "Window Age Description",
        "Window Age Description Proportion (%)",
        "Secondary Window Age Description",
        "Secondary Window Age Description Proportion (%)",
        "Number of Windows",
        "Total Number of Doors",
        "Number of Insulated Doors",
        "Existing Primary Heating System",
        "Existing Primary Heating PCDF Reference",
        "Existing Primary Heating Controls",
        "Existing Primary Heating % of Heat",
        "Existing Secondary Heating System",
        "Existing Secondary Heating PCDF Reference",
        "Existing Secondary Heating Controls",
        "Existing Secondary Heating % of Heat",
        "Secondary Heating Code",
        "Water Heating Code",
        "Total Floor Area (m2)",
        "Total Ground Floor Area (m2)",
        "RIR Floor Area",
        "Main Building Wall Area (m2)",
        "First Extension Wall Area (m2)",
        "Number of Light Fittings",
        "Number of LEL Fittings",
        "Number of fittings needing LEL",
        "Main Roof Type",
        "Main Roof Insulation",
        "Main Roof Insulation Thickness",
        "Main Wall Type",
        "Main Wall Insulation",
        "Main Wall Dry-lining",
        "Main Wall Thickness",
        "Main Building Alternative Wall Type",
        "Main Building Alternative Wall Insulation",
        "Main Building Alternative Wall Dry-lining",
        "Main Building Alternative Wall Thickness",
        "Main Fuel",
    ))

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # Join once rather than growing the string page by page
        text = "".join(page.extract_text() for page in reader.pages)

    # Extract Address; the postcode is taken to be the last comma-separated part of the address
    address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
    data["Address"] = address_match.group(1).strip()
    data["Postcode"] = data["Address"].split(",")[-1].strip()

    # Extract Current and Potential SAP ratings
    sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
    if sap_match is None:
        # Handles the older format of the elmhurst EPR
        # The text will look something like this:
        # Least energy efficient - higher running costsD 61 - we extract D 61
        # The groups must be named because they are read back by name below.
        sap_match = re.search(
            r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
            text)
        data["Current EPC Band"] = sap_match.group("current_epc")
        data["Current SAP Rating"] = int(sap_match.group("current_sap"))
    else:
        # Only the current rating is kept; the potential rating (group 2) is not used
        data["Current SAP Rating"] = int(sap_match.group(1))

    # Extract the primary energy use intensity
    additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
    if additional_rating_match:
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
    else:
        # Handles the older format of the Elmhurst EPR
        primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
        data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
        # We calculate the primary energy use intensity by dividing by floor area
        floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)

    # Extract age band
    age_band_match = re.search(
        r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
        text
    )
    data["Main Building Age Band"] = age_band_match.group(1)

    # Extract Number of Storeys
    storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
    data["Number of Storeys"] = int(storeys_match.group(1))

    # Extract Fuel Bill
    fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
    data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    # Extract Total Number of Doors
    total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
    data["Total Number of Doors"] = int(total_doors_match.group(1))

    # Extract Number of Insulated Doors
    insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
    data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

    # Extract Primary Heating Section (Main Heating 1)
    primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
    # We may not have a secondary heating
    primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
    primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
    primary_text = primary_heating_section.group(1)

    data["Existing Primary Heating System"] = re.search(
        r"Main Heating Code\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating PCDF Reference"] = re.search(
        r"PCDF boiler Reference\s*(\d+)", primary_text
    ).group(1)
    data["Existing Primary Heating Controls"] = re.search(
        r"Main Heating Controls\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating % of Heat"] = int(
        re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
    )

    # Extract Secondary Heating Section (Main Heating 2)
    secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
    if secondary_heating_section is None:
        data["Existing Secondary Heating System"] = ""
        data["Existing Secondary Heating PCDF Reference"] = ""
        data["Existing Secondary Heating Controls"] = ""
        data["Existing Secondary Heating % of Heat"] = 0

    else:
        secondary_text = secondary_heating_section.group(1)

        main_heating_code_match_secondary = re.search(
            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
        )
        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()

        data["Existing Secondary Heating PCDF Reference"] = re.search(
            r"PCDF boiler Reference\s*(\d+)", secondary_text
        ).group(1)

        if data["Existing Secondary Heating System"] == "":
            data["Existing Secondary Heating Controls"] = ""
        else:
            # Might not have heating controls on 2nd system
            secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
            data["Existing Secondary Heating Controls"] = (
                secondary_controls_match.group(1).strip() if secondary_controls_match else ""
            )
        data["Existing Secondary Heating % of Heat"] = int(
            re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
        )

    # Extract Secondary Heating and Water Heating Codes
    secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
    water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)

    if data["Existing Secondary Heating System"] == "":
        data["Secondary Heating Code"] = ""
    else:
        data["Secondary Heating Code"] = secondary_heating_code_match.group(
            1).strip() if secondary_heating_code_match else ""
    data["Water Heating Code"] = water_heating_code_match.group(1).strip()

    # Extract Windows information
    windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
    if windows_section:
        windows_text = windows_section.group(1)
        window_data = extract_window_age_description(windows_text)
        data.update(window_data)

    building_parts = extract_building_parts_epr(text)
    data.update(building_parts)

    # Get number of lighting outlets and number of fittings needing LEL
    lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
    data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
    # Dots are escaped so that only the literal "L.E.L." label matches
    lel_fittings_match = re.search(r"Total number of L\.E\.L\. fittings\s*(\d+)", text)
    data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
    data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

    roof_details = extract_roof_details_epr(text)
    # Get from the main building
    main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]]
    data["Main Roof Type"] = main_roof_details[0]["Roof Type"]
    data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"]
    data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"]

    wall_details = extract_wall_details_epr(text)
    main_wall_details = [w for w in wall_details if "Main" in w["Building Part"]][0]
    data["Main Wall Type"] = main_wall_details["Wall Type"]
    data["Main Wall Insulation"] = main_wall_details["Wall Insulation"]
    data["Main Wall Dry-lining"] = main_wall_details["Wall Dry-lining"]
    data["Main Wall Thickness"] = main_wall_details["Wall Thickness"]
    data["Main Building Alternative Wall Type"] = main_wall_details["Alternative Wall Type"]
    data["Main Building Alternative Wall Insulation"] = main_wall_details["Alternative Wall Insulation"]
    data["Main Building Alternative Wall Dry-lining"] = main_wall_details["Alternative Wall Dry-lining"]
    data["Main Building Alternative Wall Thickness"] = main_wall_details["Alternative Wall Thickness"]

    return data
def detect_report_type(pdf_path, pdf_file):
    """
    Classify a PDF from its first page, its page count and its filename.

    :param pdf_path: String path to the PDF file
    :param pdf_file: String name of the PDF file
    :return: "epr" for an energy report with more than 3 pages, "short_form_epr" for an energy report
        with 3 pages or fewer, "summary", "condition", or None when nothing matches
    """
    with open(pdf_path, "rb") as handle:
        # PyPDF2 emits warnings such as "Multiple definitions in dictionary at byte 0x1ab for key /Filter"
        # on irregular PDFs. A library like fitz could possibly handle these better.
        reader = PyPDF2.PdfReader(handle)
        first_page_text = reader.pages[0].extract_text() if reader.pages else ""
        n_pages = len(reader.pages)

    if is_energy_report(first_page_text):
        # A full EPR has more than 3 pages; the shortened version isn't massively useful
        return "epr" if n_pages > 3 else "short_form_epr"

    if "summary" in pdf_file.lower() or is_summary_report(first_page_text):
        return "summary"

    if is_condition_report(first_page_text):
        return "condition"

    return None
def extract_retrofit_pdfs(data_folder_path):
    """
    Extract report data from the PDFs that sit directly inside a retrofit data folder.

    An EPR takes priority: the summary report is only parsed when no EPR was found.

    :param data_folder_path: String path to the folder to scan for ".pdf" files
    :return: The extracted data from the EPR or the summary report, or None if neither is present
    """
    epr_path = None
    summary_path = None

    for pdf_file in os.listdir(data_folder_path):
        if not pdf_file.endswith(".pdf"):
            continue

        pdf_path = os.path.join(data_folder_path, pdf_file)
        report_type = detect_report_type(pdf_path, pdf_file)

        if report_type == "epr":
            epr_path = pdf_path
        elif report_type == "summary":
            summary_path = pdf_path

        # No need to look at the remaining files once one of each has been found
        if epr_path and summary_path:
            break

    if epr_path:
        return extract_epr(epr_path)
    if summary_path:
        return extract_summary_report(summary_path)

    # No relevant PDF in this folder
    return None


def is_energy_report(text):
    """
    Tell whether the given first-page text belongs to an Energy Report.

    :param text: Text extracted from the first page of the PDF
    :return: True if the text starts with 'ENERGY REPORT'
    """
    return text.startswith("ENERGY REPORT")


def is_summary_report(text):
    """
    Tell whether the given first-page text belongs to a Summary Report.

    :param text: Text extracted from the first page of the PDF
    :return: True if the text starts with 'Summary Information'
    """
    return text.startswith("Summary Information")
def detect_and_parse_report(pdf_path, pdf_file):
    """
    Work out which kind of report a PDF is and run the matching extractor.

    :param pdf_path: String path to the PDF file
    :param pdf_file: String name of the PDF file
    :return: The extracted data for an energy report or a summary report; None for a condition report
    :raises NotImplementedError: if the PDF is none of the recognised report types
    """
    # Only the first page is needed to decide the type
    with open(pdf_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        first_page_text = reader.pages[0].extract_text() if reader.pages else ""

    if is_energy_report(first_page_text):
        return extract_epr(pdf_path)

    if "summary" in pdf_file.lower() or is_summary_report(first_page_text):
        return extract_summary_report(pdf_path)

    if is_condition_report(first_page_text):
        # Condition reports are recognised but nothing is extracted from them
        return None

    raise NotImplementedError("Implement me")


def is_condition_report(text):
    """
    Tell whether the given first-page text belongs to a Condition Report.

    :param text: Text extracted from the first page of the PDF
    :return: True if the text starts with one of the two known condition report headers
    """
    known_headers = ("OsmosisACDNEWPAS2035ConditionReport", "OsmosisACDPAS2035ConditionReport")
    return text.startswith(known_headers)
def main():
    """
    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.

    Steps, in order:
    1. Collect every entry inside the StonewaterSurveys_1..NUM_FOLDERS folders under CUSTOMER_FOLDER_PATH.
    2. For each survey folder, extract report data from its PDFs via extract_retrofit_pdfs, looking in
       the retrofit assessment / RA info sub-folder first when there is one.
    3. Build a DataFrame, derive primary energy and EPC band, and drop known duplicate folders.
    4. Match each surveyed row of the retrofit packages board to exactly one survey folder.
    5. Merge on the packages, window fitted dates, primary energy numbers and organisation reference.
    6. Write the costed packages workbook and the cost sheet to Excel, then call
       create_proposed_wave_3_bid.

    NUM_FOLDERS, CUSTOMER_FOLDER_PATH and sap_to_epc are defined elsewhere in the module.

    :raises Exception: when the matching produces an ambiguous result, more or fewer than one unmatched
        Address ID, or duplicate Address IDs / survey folders
    """
    # NOTE(review): block nesting in this function was reconstructed from the statement order - confirm
    # against the original file before relying on it.

    # List only directories in the specified FILE_PATH
    survey_folders = []

    # Loop over each survey folder and list its contents
    for i in range(1, NUM_FOLDERS + 1):
        folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
        if os.path.isdir(folder_path):  # Check if folder exists
            folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
            survey_folders.extend(folder_contents)  # Append contents to the master list

    # Get rid of .DS_Store files
    survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")]

    extracted_data = []
    for survey_folder in tqdm(survey_folders):
        survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)

        # List the folders inside of the survey folder
        survey_subfolders = [name for name in os.listdir(survey_folder_path)
                             if os.path.isdir(os.path.join(survey_folder_path, name))]

        # Check if there's a "retrofit assessment" folder
        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)

        ra_folder = next(
            (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
            None
        )

        # If retrofit assessment folder exists, check if it has content
        if retrofit_folder or ra_folder:
            # The "retrofit assessment" folder wins when both kinds of folder exist
            if retrofit_folder:
                retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
            else:
                retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)

            # Check if everything inside is a sub-folder and the number of folders is 2
            items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
            all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
            if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
                # Get the folder that isn't Property Pics
                retrofit_folder_path = os.path.join(
                    retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
                )

            if os.listdir(retrofit_folder_path):  # If not empty
                summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
                if summary_data:
                    summary_data = {
                        "survey_folder": survey_folder,
                        **summary_data,
                    }
                    extracted_data.append(summary_data)
                    continue
            else:
                # Then we have an empty Retrofit Assessment folder
                continue

        # If no retrofit folder or it was empty, check files in survey_folder

        summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
        if not summary_data:
            # Fall back to the only sub-folder, if there is exactly one
            if len(survey_subfolders) == 1:
                survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
                summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)

        if summary_data:
            summary_data = {
                "survey_folder": survey_folder,
                **summary_data,
            }
            extracted_data.append(summary_data)

    extracted_data = pd.DataFrame(extracted_data)

    # Overwrites the column for every row with intensity * floor area
    extracted_data["Primary Energy Use (kWh/yr)"] = (
        extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
    )
    extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
    extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)

    # Remove some definite duplicates
    dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"]
    dupes = extracted_data[extracted_data["Address"].isin(dupes)]
    dupes = dupes.sort_values("Address")
    # Get all of the folders that end with ROSS
    to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()

    extracted_data = extracted_data[
        ~extracted_data["survey_folder"].isin(
            [
                "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
                "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
                "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
            ] + to_drop
        )
    ]

    # We now merge on the coordinator data so that against each property, we can map the measures
    # TODO: Get the pre & post primary energy numbers
    # TODO: Make sure the numbers are going down

    retrofit_packages_board = pd.read_excel(
        os.path.join(
            CUSTOMER_FOLDER_PATH,
            "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
        ),
        header=4
    )
    retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
    # Take just the rows that have been surveyed
    retrofit_packages_board = retrofit_packages_board[
        retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
    ]
    # populated_primary_energy = retrofit_packages_board[
    #     ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)'])
    # ]
    #
    # z = populated_primary_energy[
    #     populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[
    #         'BASE Primary energy (13a-272)']
    # ]
    #
    # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[
    #     'BASE Primary energy (13a-272)'])

    # Replace \n with ""
    extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "")

    # Board "Name" -> survey folder, for homes that are looked up by folder instead of by postcode
    manual_filters = {
        "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
        "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
        "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
        'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
        '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
        '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
        'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
        'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
        '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
        '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
        '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
        '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
        # '2 Sorrell Place': '',
        # '72 St Ives Road': '',
        # '1 The Close, Burton Gardens': '',
        # '102 Cheaton Close': '',
        # 'Flat 16 Spring Gardens': '',
        # '4 Apple Close': '',
        # '25 Folly Lane': '',
        '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
        '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
        '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
        '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
        '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
        '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
        '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
        "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
        '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
        '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
        '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
        '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
    }

    # We now match this retrofit packages board to the extracted data
    matching_lookup = []
    for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):

        # Handle the case that has the wrong postcode in the asset data
        if home["Name"] in manual_filters:
            filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy()
        else:
            filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()

            # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
            # NOTE(review): home["Name"] is used with Python's str.replace, which treats r"[^\w\s]" as a
            # literal substring, so no punctuation is removed from the name here. Whether the pandas
            # .str.replace calls treat the pattern as a regex depends on the pandas version - confirm.
            # NOTE(review): .str.contains is called without regex=False, so the name is presumably
            # interpreted as a regular expression - confirm.
            to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
                home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
            )
            if to_filter.sum() == 0:
                # Second attempt: strip commas and full stops explicitly, and keep "Flat" in the name
                to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
                                                                                                                   "").str.contains(
                    home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
                )
            filtered = filtered[to_filter]

        if filtered.empty:
            continue

        if filtered.shape[0] == 1:
            matching_lookup.append(
                {
                    "survey_folder": filtered["survey_folder"].values[0],
                    "Address ID": home["Address ID"],
                    "Name": home["Name"]
                }
            )
            continue

        # More than one candidate left at this point
        # home["Name"] should be contained in the survey_folder
        filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
        # We have an edge case where some properties have two outputs in Sharepoint
        if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
            raise Exception("Fix me1")
            # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]

        if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
            raise Exception("Fix me2")
            # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]

        if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
            filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]

        if filtered.empty:
            continue
        if filtered.shape[0] != 1:
            raise Exception("something went wrong")

        matching_lookup.append(
            {
                "survey_folder": filtered["survey_folder"].values[0],
                "Address ID": home["Address ID"],
                "Name": home["Name"]
            }
        )

    matching_lookup = pd.DataFrame(matching_lookup)
    # Find Osmosis IDs that are in the packages board but not in the matching lookup
    missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"])
    missing_ids = list(missing_ids)
    if missing_ids:
        # We check that the missing ids have no data yet
        # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)]
        # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv(
        #     CUSTOMER_FOLDER_PATH + "/missed_debugging.csv")

        # Exactly one unmatched Address ID is tolerated; any other non-zero count is an error
        if len(missing_ids) != 1:
            raise Exception("Unacceptable number of missings")

    if matching_lookup["Address ID"].duplicated().sum():
        raise Exception("Duplicate Address IDs")

    if matching_lookup["survey_folder"].duplicated().sum():
        raise Exception("Duplicate survey folders")

    # Measure columns taken from the packages board; each also gets an empty "Cost of ..." column below
    measure_columns = [
        'Main Wall Insulation',
        'Secondary Wall Insulation',
        'Loft insulation',
        'Flat Roof',
        'Room in Roof',
        'Window Upgrade',
        'Door Upgrade',
        'Ventilation',
        'Main Heating',
        'Water Heating',
        'Heating Controls',
        'Solar PV',
        'Other measures'
    ]

    # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
    stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge(
        retrofit_packages_board[
            [
                "Name",
                "RA",
                "Address ID",
                "Archetype ID",
                "Arch. Group Rank",
                "Actual SAP Band",
                "Actual SAP Rating",
                "Modelled SAP Band",
                "Modelled SAP Rating",
                "Package Ref",
            ] + measure_columns
        ],
        on=["Address ID", "Name"],
        how="left"
    )

    if stonewater_data["Address ID"].duplicated().sum():
        raise Exception("Duplicate Address IDs")
    # Create a section for costs
    for measure in measure_columns:
        stonewater_data[f"Cost of {measure}"] = None

    stonewater_data["Total Cost of Measures"] = None
    stonewater_data["Contingency Cost"] = None
    stonewater_data["Total Cost of Measures inc Contingency"] = None

    # We've appended the recommended packages and modelled SAP ratings to the data
    # We also want to append the windows data
    windows_data = pd.read_excel(
        os.path.join(
            CUSTOMER_FOLDER_PATH,
            "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
        ),
        header=12
    )

    # Drop repeated header rows and rows without an Address ID
    windows_data = windows_data[windows_data["Address ID"] != "Address ID"]
    windows_data = windows_data[~pd.isnull(windows_data["Address ID"])]

    # We get a lookup id of Osm.ID and when the windows were fitted
    windows_data = windows_data[
        ["Address ID", "Window attributes - Fitted/renewed date",
         "Parent Asset Window attributes - Fitted/renewed date"]
    ]
    # Convert to string for the moment
    windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
        "Parent Asset Window attributes - Fitted/renewed date"
    ].astype(str)
    # Create a single date column: the property's own date when present, otherwise the parent asset's
    windows_data["Fitted/renewed date"] = np.where(
        pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
        windows_data["Window attributes - Fitted/renewed date"],
        windows_data["Parent Asset Window attributes - Fitted/renewed date"]
    )
    # Convert to a date
    windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
    # Calculate the number of years since something was done on the windows
    windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
        "Fitted/renewed date"]).dt.days / 365

    stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"])
    windows_data["Address ID"] = windows_data["Address ID"].astype(float)
    stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left")
    stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True)

    if stonewater_data["Address ID"].duplicated().sum():
        raise Exception("Duplicate Address IDs")

    # The date columns are converted to strings before the data is written out
    for c in [
        'Window attributes - Fitted/renewed date',
        'Parent Asset Window attributes - Fitted/renewed date',
        'Fitted/renewed date'
    ]:
        stonewater_data[c] = stonewater_data[c].astype(str)

    # Fill the primary energy numbers from the excel
    stonewater_data = stonewater_data.merge(
        retrofit_packages_board[
            [
                "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)"
            ]
        ],
        on=["Address ID", "Name"],
        how="left"
    )
    # Only rows with no extracted primary energy take the BASE value from the board
    stonewater_data["Primary Energy Use (kWh/yr)"] = np.where(
        pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]),
        stonewater_data["BASE Primary energy (13a-272)"],
        stonewater_data["Primary Energy Use (kWh/yr)"]
    )
    stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"])

    # Add on organisation reference
    # NOTE(review): this is an absolute path on one user's machine, unlike the other inputs which are
    # built from CUSTOMER_FOLDER_PATH - presumably the same workbook passed to
    # create_proposed_wave_3_bid below; confirm.
    original_archetypes = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
        "- Archetyped V3.1.xlsx",
        header=4
    )
    original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
    original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)

    stonewater_data = stonewater_data.merge(
        original_archetypes[["Address ID", 'Org. ref.']],
        on="Address ID",
        how="left"
    )

    # Save this data to excel
    stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False)

    # Cost per measure; a cost of None means no cost has been entered for that measure
    cost_sheet = [
        {
            "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2"
        },
        {
            "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2"
        },
        {
            "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2"
        },
        {
            "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2"
        },
        {
            "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2"
        },
        {
            "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2"
        },
        {
            "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2"
        },
        {
            "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each"
        },
        {
            "measure": "Secondary 2.40", "cost": 974, "unit": "each"
        },
        {
            "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each"
        },
        {
            "measure": "Ins. Door 1.40 w.m2.K", "cost": None, "unit": "each"
        },
        {
            "measure": "DMEV", "cost": 900, "unit": "each"
        },
        {
            "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each"
        },
        {
            "measure": "HHRSH Quantum 150", "cost": None, "unit": "each"
        },
        {
            "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each"
        },
        {
            "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each"
        },
        {
            "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each"
        },
        {
            "measure": "Smart Thermostat", "cost": 1200, "unit": "each"
        },
        {
            "measure": "TRV's", "cost": 350, "unit": "each"
        },
        {
            "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each"
        },
        {
            "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each"
        },
        {
            "measure": "LEL", "cost": 35, "unit": "per bulb"
        },
        {
            "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2"
        },
        {
            "measure": "Roof 0.16 - Walls 0.16", "cost": 180, "unit": "floor area m2"
        },
    ]
    cost_sheet = pd.DataFrame(cost_sheet)

    # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater
    cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False)

    # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values

    # Note that the bid is built from a different costed-packages workbook than the one written above
    create_proposed_wave_3_bid(
        costed_packages_filepath=os.path.join(
            CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx"
        ),
        archetypes_sheet_filepath=os.path.join(
            CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
        )
    )
+ archetypes_to_cost['Surveyed Main Heating'] = ( + archetypes_to_cost['Existing Primary Heating System'] + ': code - ' + archetypes_to_cost[ + 'Existing Primary Heating PCDF Reference'].astype(str) + ) + + archetypes_to_cost = archetypes_to_cost.drop( + columns=['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference']) + + # We take properties that are EPC D and below (59% of units) + archetypes_to_cost = archetypes_to_cost[archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])] + + archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"]) + + # These are the Arhetypes that will likely be suitable for Wave 3 + archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4) + archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])] + archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"] + archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int) + + # We merge the property details onto the costed archetypes + archetypes_to_cost = archetypes_to_cost.merge( + archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + proposed_sample = archetypes_sheet[ + archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + not_proposed = archetypes_sheet[ + ~archetypes_sheet["Archetype ID"].astype(str).isin(archetypes_to_cost["Archetype ID"].astype(int).astype(str)) + ] + + # archetypes_without_survey = [] + # for p in list(set(not_proposed)): + # filtered = costed_packages[costed_packages["Archetype ID"].astype(int).astype(str) == p] + # if filtered.empty: + # archetypes_without_survey.append(p) + + # Can we propose anything about archetypes that were not surveyed? 
+ + proposed_sample = proposed_sample[ + [ + "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. ID", "Archetype ID", + "Property Type", "Wall Type", "Roof Type", "Heating" + ] + ] + + # We classify into high and low confidence + + archetypes_to_cost["Surveyed Main Roof"] = archetypes_to_cost["Surveyed Main Roof"].fillna("") + + match_classification = [] + for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)): + + surveyed = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]].copy() + surveyed["Package Ref"] = surveyed["Package Ref"].astype(str) + + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + + surveyed_heating = " or ".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + + # We now check if we have a perfect match + surveyed = surveyed[ + (surveyed["Property Type"] == home["Property Type"]) & + (surveyed["Wall Type"] == home["Wall Type"]) & + (surveyed["Roof Type"] == home["Roof Type"]) & + (surveyed["Heating"] == home["Heating"]) + ] + + if surveyed.empty: + if package == "2B2A": + raise Exception("Fix me") + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Approximate", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating + } + ) + continue + # Re-do + package = " or ".join(sorted([x for x in surveyed["Package Ref"].unique() if x.strip()])) + package = package.replace("\n", "") + surveyed_roofs = " or ".join(sorted([x for x in surveyed["Surveyed Main Roof"].unique() if x.strip()])) + surveyed_roofs = surveyed_roofs.replace("\n", "") + surveyed_heating = " or 
".join(sorted([x for x in surveyed["Surveyed Main Heating"].unique() if x.strip()])) + surveyed_heating = surveyed_heating.replace("\n", "") + + match_classification.append( + { + "Address ID": home["Address ID"], + "Match to Surveyed": "Exact", + "Proposed Package Ref": package, + "Surveyed Archetype Roofs": surveyed_roofs, + "Surveyed Archetype Heating": surveyed_heating + } + ) + + match_classification = pd.DataFrame(match_classification) + + proposed_sample = proposed_sample.merge( + match_classification, + on="Address ID", + how="left", + ) + + # Merge on the cost per archetype + cost_per_archetype = ( + archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index() + ) + proposed_sample = proposed_sample.merge( + cost_per_archetype, + on="Archetype ID", + how="left" + ) + + # We add on a boolean to indicate if a property from that archetype has been modelled + proposed_sample = proposed_sample.merge( + archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(), + on="Archetype ID", + how="left" + ) + + proposed_sample["Total Cost of Measures inc Contingency"] = np.where( + ~proposed_sample["Has been modelled"], + None, proposed_sample["Total Cost of Measures inc Contingency"] + ) + + proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True) + + # Save excel + proposed_sample.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx", index=False) + + # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out + proposed_sample_postcodes = proposed_sample["Postcode"].unique() + + postcode_summary = [] + for postcode in proposed_sample_postcodes: + in_proposal = proposed_sample[proposed_sample["Postcode"] == postcode] + not_in_proposal = not_proposed[not_proposed["Postcode"] == postcode] + postcode_summary.append( + { + "Postcode": postcode, + "Number of properties in Proposal": len(in_proposal), + 
"Number of properties not in Proposal": len(not_in_proposal) + } + ) + postcode_summary = pd.DataFrame(postcode_summary) + postcode_summary = postcode_summary.sort_values( + "Number of properties not in Proposal", + ascending=False).reset_index(drop=True) + + postcode_summary.to_excel( + CUSTOMER_FOLDER_PATH + "/Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx", index=False + ) + + +def find_remaining_surveys(): + """ + This compares a list of properties that have been surveyed against a list of properties that I have produced + costed retrofit packages for, so I know what needs to be downloaded from Sharepoint + :return: + """ + + surveyed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater" + "/Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx", + header=4 + ) + + costed = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages " + "20241030 (WIP) MR Review v1.xlsx", + header=13, + sheet_name="Modelled Packages" + ) + costed = costed[~pd.isnull(costed["Address ID"])] + + needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])] + + needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. 
def append_stonewater_id():
    """
    Complete an ad hoc request from Stonewater to add their organisation reference onto the model.

    Reads the "Modelled Packages" sheet of the bid packages workbook, looks up "Org. ref." for each
    "Address ID" in the original archetyped asset list, and writes the combined table to "Stonewater IDs.xlsx".

    :return: None
    :raises ValueError: if any modelled property has no "Org. ref." after the merge
    """

    model_proposed_sample = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Bid Packages WIP 13.11.24.xlsx",
        sheet_name="Modelled Packages",
        header=13
    )
    # .copy() so the Address ID cast below acts on an independent frame, not a filtered slice
    model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])].copy()
    model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int)

    original_archetypes = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
        "- Archetyped V3.1.xlsx",
        header=4
    )
    # Drop rows whose Address ID is empty or is the literal text "Address ID" before casting to int
    original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
    original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"].copy()
    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)

    matched = model_proposed_sample.merge(
        original_archetypes[["Address ID", 'Org. ref.']],
        on="Address ID",
        how="left"
    )

    # Every modelled property must have found its organisation reference
    if pd.isnull(matched["Org. ref."]).sum():
        raise ValueError("Something went wrong")

    # Save as Excel
    matched.to_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater IDs.xlsx",
        sheet_name="Proposed Wave 3 Sample",
        index=False
    )
+ Name, if we take a geographical area (which could be postal region) they want the most, and ideally all, of the + properties within that geographical area to be included within the bid + :return: + """ + + asset_list = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + ) + + # TODO: We drop 7 properties missing + # UPRN + asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])] + # Clean address ids + asset_list = asset_list[~pd.isnull(asset_list["Address ID"])] + asset_list = asset_list[asset_list["Address ID"] != "Address ID"] + asset_list["Address ID"] = asset_list["Address ID"].astype(int) + + asset_list["Street name"] = np.where( + pd.isnull(asset_list["Street name"]), + asset_list["Postcode"], + asset_list["Street name"] + ) + + # Create the postal region, taking the first part of the postcode + asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0] + asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"] + unique_postal_regions = asset_list["Postal Region"].unique() + + # Keep just the columns we need + asset_list = asset_list[ + ["UPRN", "Address ID", 'Org. 
ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region", + "Property Type", "Wall Type", "Roof Type", "Heating"] + ] + + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + survey_results = survey_results[ + [ + "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode", + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Existing Primary Heating System", + "Package Ref", + "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation", + "Main Building Alternative Wall Thickness" + ] + ].rename( + columns={ + "Existing Primary Heating System": "Survey: Primary Heating System" + } + ) + + survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0] + # Concatenate from the wall information + survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[ + "Main Wall Insulation Type"].astype(str) + # Alternative wall + survey_results["Survey: Main Alternative Wall"] = ( + survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[ + "Main Building Alternative Wall Insulation"].astype(str) + ) + # Roof information + survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[ + "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str) + + # Drop the individual columns: + survey_results = survey_results.drop( + columns=[ + "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness", + "Main Wall Type", "Main Wall Insulation Type", + "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation" + ] + ) + + survey_results_with_original_features = 
survey_results.merge( + asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]], + on="Address ID", + how="left" + ) + + if survey_results_with_original_features.shape[0] != survey_results.shape[0]: + raise ValueError("Something went wrong") + + # Against properties that have NO package ref, we assign a package ref + properties_with_packages = survey_results_with_original_features[ + ~pd.isnull(survey_results_with_original_features["Package Ref"]) + ] + + properties_without_packages = survey_results_with_original_features[ + (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull( + survey_results_with_original_features["Package Ref"] + ) + ] + + # Change this to a lookup + package_ratings = pd.DataFrame([ + { + "1A": 1, + "1B": 2, + "2A": 3, + "2B": 4, + "3A": 5, + "3B": 6, + 4: 7 + } + ]) + package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank") + + mapped_package_refs = [] + for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)): + # Same archetype? 
+ matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]] + + if matches.empty: + # Similar property + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"] == property["Wall Type"]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + matches = properties_with_packages[ + (properties_with_packages["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0]) & + (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) & + (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) & + (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0]) + ] + if matches.empty: + raise Exception("Implement me") + if matches.shape[0] > 1: + # Take the package with the highest rank + matches = matches.merge( + package_ratings, + on="Package Ref", + how="left" + ).sort_values("Rank", ascending=False).head(1) + + mapped_package_refs.append( + { + "Address ID": property["Address ID"], + "Matched Package Ref": matches["Package Ref"].values[0] + } + ) + + mapped_package_refs = pd.DataFrame(mapped_package_refs) + + survey_results = survey_results.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results["Package Ref"] = np.where( + pd.notnull(survey_results["Matched Package Ref"]), + survey_results["Matched Package Ref"], + survey_results["Package Ref"] + ) + survey_results = survey_results.drop(columns=["Matched Package Ref"]) + + # Do the same with survey_results_with_original_features + survey_results_with_original_features = 
survey_results_with_original_features.merge( + mapped_package_refs, + on="Address ID", + how="left" + ) + survey_results_with_original_features["Package Ref"] = np.where( + pd.notnull(survey_results_with_original_features["Matched Package Ref"]), + survey_results_with_original_features["Matched Package Ref"], + survey_results_with_original_features["Package Ref"] + ) + survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"]) + + # Save the data for reference + # mapped_package_refs = mapped_package_refs.merge( + # asset_list[["Name", "Postcode", "Address ID", "Org. ref."]], + # on="Address ID", + # how="left" + # ) + # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False) + + # We get longitude & Latitude + archetyping_spatial_features = read_pickle_from_s3( + bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl", + ) + archetyping_spatial_features = pd.concat(archetyping_spatial_features) + archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename( + columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"} + ) + # Merge them onto both datasets + asset_list = asset_list.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(asset_list["longitude"]).sum(): + raise ValueError("Something went wrong") + + survey_results_with_original_features = survey_results_with_original_features.merge( + archetyping_spatial_features, how="left", on="UPRN" + ) + if pd.isnull(survey_results_with_original_features["longitude"]).sum(): + raise ValueError("Something went wrong") + + def haversine(lat1, lon1, lat2, lon2): + # Radius of Earth in meters + R = 6371000 + + # Convert degrees to radians + lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) + + # Differences + dlat = lat2 - lat1 + dlon = lon2 - lon1 + + # Haversine formula + a = 
np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2 + c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)) + distance = R * c + return distance + + # Tier definitions + # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D + # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D + # + + def match_property_to_surveyed(property, survey_results_with_original_features): + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] + ) & + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) + ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Postal Region"] == + property["Postal Region"] + ) & + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) + & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) + ].copy() + + # surveyed = survey_results_with_original_features[ + # ( + # survey_results_with_original_features["Property Type"] == + # property["Property Type"] + # ) & + # ( + 
# survey_results_with_original_features["Wall Type"] == + # property["Wall Type"] + # ) & + # ( + # survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + # property["Roof Type"].split(":")[0] + # ) & + # ( + # survey_results_with_original_features["Heating"] == + # property["Heating"] + # ) + # ].copy() + + if not surveyed.empty: + return surveyed + + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"] == + property["Property Type"] + ) & + ( + survey_results_with_original_features["Wall Type"] == + property["Wall Type"] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Heating"].str.split(":").str[0] == + property["Heating"].split(":")[0] + ) + ].copy() + + return surveyed + + def fill_survey_columns(region_assets, suffix): + for col in [ + 'Current EPC Band', 'Current SAP Rating', + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', + 'Survey: Main Roof Type', 'Survey: Primary Heating System', + 'Survey: Matching Address ID', 'Distance to Closest Match (m)', + "Package Ref" + ]: + region_assets[col] = np.where( + pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]), + region_assets[col + suffix], region_assets[col] + ) + return region_assets + + survey_attribute_columns = [ + "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System' + ] + + survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy() + + results = [] + for region in tqdm(unique_postal_regions): + # Take all of the properties in that region + region_assets = asset_list[asset_list["Postal Region"] == region].copy() + + # We have a tier 1 match if the property itself was surveyed + exact_surveyed = survey_results[ + survey_results["Address ID"].isin(region_assets["Address ID"]) + ] + 
+ region_assets = region_assets.merge( + exact_surveyed[ + ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [ + "Survey: Matching Address ID", "Package Ref" + ] + ], + on="Address ID", + how="left" + ) + region_assets['Distance to Closest Match (m)'] = None + region_assets["Distance to Closest Match (m)"] = np.where( + ~pd.isnull(region_assets["Current EPC Band"]), + 0, + region_assets["Distance to Closest Match (m)"] + ) + + # Label the tier 1 properties + region_assets["Confidence Tier"] = None + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]), + "1 - property was surveyed", region_assets["Confidence Tier"] + ) + + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band"].isin(["C", "B", "A"]), + "5 - property was surveyed", region_assets["Confidence Tier"] + ) + + archetype_ids = region_assets[ + pd.isnull(region_assets["Confidence Tier"]) + ]["Archetype ID"].unique() + # We get the properties that have been surveyed + + region_surveyed = [] + for arch_id in archetype_ids: + for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows(): + archetype_data = survey_results_with_original_features[ + survey_results["Archetype ID"] == arch_id + ].copy() + if archetype_data.empty: + continue + + match_type = "2 - same archetype" + if any(archetype_data["Postal Region"] == property["Postal Region"]): + match_type = "1 - same archetype, same postal region" + archetype_data = archetype_data[ + archetype_data["Postal Region"] == property["Postal Region"] + ] + + if archetype_data.shape[0] > 1: + # Look for an exact match, or as close as possible + archetype_data_filtered = match_property_to_surveyed(property, archetype_data) + if not archetype_data_filtered.empty: + archetype_data = archetype_data_filtered + + archetype_data["distance_meters"] = haversine( + lat1=property.latitude, lon1=property.longitude, + 
lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values + ) + expected_sap = np.average( + archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + + archetype_data = archetype_data.sort_values("distance_meters", ascending=True) + + # We take the features of the closest matching property + closest_match = archetype_data.iloc[0] + + # Set the package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = archetype_data["Package Ref"].dropna().values[0] + + region_surveyed.append( + { + "Archetype ID": arch_id, + "Address ID": property["Address ID"], + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"], + 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"], + 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": package_ref, + "Match Type": match_type + } + ) + region_surveyed = pd.DataFrame(region_surveyed) + + if region_surveyed.empty: + region_surveyed = pd.DataFrame( + columns=[ + "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating", + 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type', + 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)', + "Match Type", "Package Ref" + ] + ) + + starting_shape = region_assets.shape[0] + region_assets = region_assets.merge( + region_surveyed, + on=["Archetype ID", "Address ID"], + how="left", + suffixes=("", "_method1") + ) + if region_assets.shape[0] != starting_shape: + raise ValueError("Something went wrong") + + # Label the tier 1 
properties + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]), + region_assets["Match Type"], region_assets["Confidence Tier"] + ) + + # Handle EPC C + region_assets["Confidence Tier"] = np.where( + region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) & + pd.isnull(region_assets["Confidence Tier"]), + "5 - EPC C or above", region_assets["Confidence Tier"] + ) + + region_assets = fill_survey_columns(region_assets, suffix="_method1") + + method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")] + region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"]) + + missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist() + + if not missed_addressids: + results.append(region_assets) + continue + + # This means that this archetype was never surveyed and so we need to find a sufficiently similar property + final_missed_matches = [] + for a_id in missed_addressids: + + match_type = "3 - compared to similar properties" + + property = asset_list[asset_list["Address ID"] == a_id].squeeze() + + surveyed = match_property_to_surveyed(property, survey_results_with_original_features) + + if surveyed.empty: + match_type = "3 - compared to similar properties, relaxed" + # In this case, we do one additional check where we filter on everything the same apart from heating, + # where we do a slightly more rough match + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0] == + property["Property Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof 
Type"].split(":")[0] + ) + ].copy() + + if surveyed.empty: + if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]: + filter_property_types = ["House", "Bungalow", ] + else: + filter_property_types = ["Flat"] + surveyed = survey_results_with_original_features[ + ( + survey_results_with_original_features["Property Type"].str.split(":").str[0].isin( + filter_property_types + ) + ) & + ( + survey_results_with_original_features["Wall Type"].str.split(":").str[0] == + property["Wall Type"].split(":")[0] + ) & + ( + survey_results_with_original_features["Roof Type"].str.split(":").str[0] == + property["Roof Type"].split(":")[0] + ) + ].copy() + + if "Electric" in property["Heating"]: + # Take other electric heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Electric")] + elif property["Heating"] in [ + "Community Heating Systems: Community boilers only (RdSAP)", + "Community Heating Systems: Community CHP and boilers (RdSAP)" + ]: + # Take other community heating systems + surveyed = surveyed[surveyed["Heating"].str.contains("Community")] + elif property["Heating"] == 'Heat Pump: (from database)': + # Take other heat pumps + surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")] + elif property["Heating"] == "Solid fuel room heaters: Open fire in grate": + # Take other properties with room heaters + surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")] + elif "Boiler" in property["Heating"]: + # Take other properties with boilers + surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")] + else: + raise Exception("Fix me") + + if surveyed.empty: + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": "4 - no similar property, needs survey to confirm", + "Current EPC Band": "Needs Survey", + "Current SAP Rating": "Needs Survey", + 'Survey: Main Wall Type': "Not Surveyed", + "Survey: Main Alternative Wall": "Not Surveyed", + "Survey: Main Roof Type": "Not Surveyed", + 
"Survey: Primary Heating System": "Not Surveyed", + "Survey: Matching Address ID": "Not Surveyed", + 'Distance to Closest Match (m)': 9999999, + "Package Ref": "Not Surveyed", + } + ) + continue + + # Calculate distance + surveyed["distance_meters"] = haversine( + lat1=property["latitude"], lon1=property["longitude"], + lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values + ) + surveyed = surveyed.sort_values("distance_meters", ascending=True) + + # Check if we have a postcode match check if surveyed postcode is the same as the property postcode + if any(surveyed["Postcode"] == property["Postcode"]): + surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]] + + if any(surveyed["Postal Region"] == property["Postal Region"]): + surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]] + + # Take the 3 nearest + surveyed = surveyed.head(3) + + # perform a weighted mean of SAP rating - the closer the better + expected_sap = np.average( + surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1) + ) + expected_epc = sap_to_epc(expected_sap) + + if expected_epc in ["C", "B", "A"]: + match_type = "5 - EPC C or above" + + closest_match = surveyed.iloc[0] + + # The closest property may be an EPC C, we we take the package ref from the property that's the nearest + # with non-NA package ref + if expected_epc in ["C", "B", "A"]: + package_ref = None + else: + package_ref = surveyed["Package Ref"].dropna().values[0] + + final_missed_matches.append( + { + "Address ID": a_id, + "Confidence Tier": match_type, + "Current EPC Band": expected_epc, + "Current SAP Rating": expected_sap, + 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"], + "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"], + "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"], + "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"], + "Survey: Matching Address ID": 
closest_match["Address ID"], + 'Distance to Closest Match (m)': closest_match["distance_meters"], + "Package Ref": package_ref + } + ) + continue + + final_missed_matches = pd.DataFrame(final_missed_matches) + + region_assets = region_assets.merge( + final_missed_matches, + on="Address ID", + how="left", + suffixes=("", "_method3") + ) + + region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna( + region_assets["Confidence Tier_method3"] + ) + + region_assets = fill_survey_columns(region_assets, suffix="_method3") + + method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")] + region_assets = region_assets.drop(columns=method_3_columns) + + if pd.isnull(region_assets["Current EPC Band"]).sum(): + raise Exception("Something went wrong") + + results.append(region_assets) + + results = pd.concat(results) + + if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum(): + raise ValueError("Missing Package Refs") + + # Check if there are missings in current epc band, current sap rating or any of the survey attributes + for c in ( + [ + "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] + + survey_attribute_columns + ): + if pd.isnull(results[c]).sum(): + raise Exception("Something went wrong") + + gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x]) + loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x]) + + def optimise(gain, loss, max_loss=250): + + # Define the coefficients for the objective function (negative because we maximize Gain) + c = -gain + + # Define constraints + A = [loss] # Only 1 constraint for now, total Loss + b = [max_loss] # Maximum total Loss allowed + + # Bounds for each variable (select or not select each row, 0 <= x <= 1) + bounds = [(0, 1) for _ in gain] + + # Solve the problem using linprog with HiGHS 
solver + result = linprog(c, A_ub=A, b_ub=b, bounds=bounds, method='highs') + if not result.success: + raise Exception("Optimization failed") + + selected_rows = result.x.round().astype(int) # Rounded to 0 or 1 + optimal_gain = -result.fun + + return selected_rows, optimal_gain + + street_summary = results.pivot_table( + index='Street and Region', + columns='Confidence Tier', + aggfunc='size', + fill_value=0 + ).reset_index() + + street_summary["Gain"] = street_summary[gain_columns].sum(axis=1) + street_summary["Loss"] = street_summary[loss_columns].sum(axis=1) + + selected_rows, _ = optimise( + gain=street_summary["Gain"].values, + loss=street_summary["Loss"].values, + max_loss=250 + ) + + street_summary["Selected"] = selected_rows == 1 + print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum()) + + selected_streets = street_summary[ + street_summary["Selected"] + ] + + totals = selected_streets[["Gain", "Loss"]].sum() + + bid_size = totals.sum() + print("Bid Size:", bid_size) + total_epc_d_or_below = totals["Gain"] + print("Total EPC D or below:", total_epc_d_or_below) + total_epc_c = totals["Loss"] + print("Total EPC C or above:", total_epc_c) + # Total needing a survey + total_needing_survey = selected_streets[ + "4 - no similar property, needs survey to confirm" + ].sum() + print("Total needing survey:", total_needing_survey) + + # Label final outputs + # We create a summary of packages by street + results["Package Ref"] = results["Package Ref"].fillna("EPC C - No Package") + results["Package Ref"] = results["Package Ref"].astype(str) + results["Package Ref"] = np.where( + results["Package Ref"] == "4.0", "4", results["Package Ref"] + ) + package_summary = results.pivot_table( + index='Street and Region', + columns='Package Ref', + aggfunc='size', + fill_value=0 + ).reset_index() + + assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0] + + street_bid_structure = street_summary.merge( + 
package_summary, how="left", on="Street and Region" + ) + street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False) + + individual_units_programme = results.copy() + individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin( + street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values + ) + + # Merge on Stonewaters ID + asset_list_ids = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 " + "- Archetyped V3.1.xlsx", + header=4 + )[["Address ID", "Org. ref."]] + # Clean address ids + asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])] + asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"] + asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int) + + individual_units_programme = individual_units_programme.merge( + asset_list_ids.rename( + columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"} + ), + how="left", + on="Survey: Matching Address ID" + ) + + individual_units_programme["Survey: Org. ref."] = np.where( + (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"), + "Not Surveyed", + individual_units_programme["Survey: Org. ref."] + ) + + if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull( + individual_units_programme["Org. 
ref."]).sum(): + raise ValueError("something went wrong") + + for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]: + individual_units_programme[col] = ( + individual_units_programme[col] + .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':' + .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': ' + .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space + .str.strip() # Strip leading/trailing spaces + ) + + # Any EPC C properties that have been included should be flagged as potential low carbon heating + selected_epc_c = individual_units_programme[ + (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) & + (individual_units_programme["Unit in Programme"]) + ] + + flat_wall_map = { + "CA Cavity: F Filled Cavity": False, + "CA Cavity: A As Built": True, + "SO Solid Brick: A As Built": True, + "Not Surveyed": False + } + + heating_map = { + "BGW Post 98 Combi condens. with auto ign.": False, + "BGB Post 98 Regular condens. 
with auto ign.": False, + "SEK High heat retention storage heaters": False, + "SEB Modern slimline storage heaters": True, + "Not Surveyed": False + } + + infill_data = [] + for _, epc_c_property in selected_epc_c.iterrows(): + if epc_c_property["Property Type"].split(":")[0] == "Flat": + # Look for a wall insulation measure + infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Possible Flat Infill?": infill + } + ) + continue + + infill = heating_map[epc_c_property["Survey: Primary Heating System"]] + infill_data.append( + { + "Address ID": epc_c_property["Address ID"], + "Street and Region": epc_c_property["Street and Region"], + "Low Carbon Heating Infill?": infill + } + ) + infill_data = pd.DataFrame(infill_data) + + individual_units_programme = individual_units_programme.merge( + infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']], + how="left", on="Address ID" + ) + + for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']: + individual_units_programme[c] = individual_units_programme[c].fillna(False) + + infill_by_street = infill_data.pivot_table( + index='Street and Region', + values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'], + aggfunc='sum', + fill_value=0 + ).reset_index() + + street_bid_structure = street_bid_structure.merge( + infill_by_street, how="left", on="Street and Region" + ) + + for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']: + street_bid_structure[c] = street_bid_structure[c].fillna(0) + + master_sheet = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master " + "sheet.csv", + encoding='latin1' + ) + master_sheet = master_sheet[["Address ID", "Main Fuel"]] + + individual_units_programme = individual_units_programme.merge( + master_sheet, how="left", 
def identify_incorrect_packages():
    """
    Flag units whose assigned package looks inconsistent with other data sources.

    Due to limitations in the data collected during survey, some properties do not have suitable
    packages assigned. This function identifies those properties so they can be flagged for
    Stonewater's review. It:

    1. Reads the "Individual Units Programme" sheet and the stockbook export, and flags units whose
       stockbook "Heating Type" disagrees with the "Heating" / "Main Fuel" columns.
    2. Flags units on a Solar PV package ("3A", "3B", "4") whose surveyed roof is another dwelling above.
    3. For each field in ``fields_to_check``, counts values per "Street, Region and Postcode" and
       records the dominant value, its share, and whether more than one primary type is present.
    4. Appends the newest stored EPC per UPRN.
    5. Looks up the nearest Wave 2.1 property (same postcode, closest door number) for units that are
       not in the top confidence tier.

    Outputs are written as CSV files to ``CUSTOMER_FOLDER_PATH``; nothing is returned.
    """
    # Function-local imports (as in the original) so the rest of the module does not require S3 access.
    import json
    from utils.s3 import read_from_s3, read_pickle_from_s3

    units_with_assigned_packages = pd.read_excel(
        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.20 V2.xlsx"),
        header=2,
        sheet_name="Individual Units Programme"
    )

    # This sheet contains information on the heating systems for properties, so we can flag any units that have
    # been labelled as being electric but are actually gas
    heating_survey_data = pd.read_excel(
        os.path.join(CUSTOMER_FOLDER_PATH, "STOCKBOOK December 2024 data (5).xlsx"),
        header=0,
        sheet_name="Export"
    )

    units_with_assigned_packages = units_with_assigned_packages.merge(
        heating_survey_data[["Asset Reference", "Heating Type"]], how="left",
        left_on="Org. ref.", right_on="Asset Reference"
    )

    # ------------------------------------------------------------------
    # Heating / fuel mismatches between the stockbook and the Parity data
    # ------------------------------------------------------------------
    units_with_assigned_packages["Gas properties: different to Parity"] = (
        units_with_assigned_packages["Heating Type"].isin(["Gas", "Communal Gas"])
        & units_with_assigned_packages["Heating"].isin(
            [
                "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C",
                "Electric Storage Systems: Fan storage heaters",
                "Electric (direct acting) room heaters: Panel, convector or radiant heaters",
            ]
        )
    )

    units_with_assigned_packages["Electric properties: different to Parity"] = (
        (units_with_assigned_packages["Heating Type"] == "Electric")
        & units_with_assigned_packages["Heating"].isin(
            [
                "Boiler: A rated Regular Boiler",
                "Boiler: F rated Combi",
                "No Heating",
                "Boiler: A rated CPSU",
                "Boiler: G rated Regular Boiler",
            ]
        )
    )

    units_with_assigned_packages["Ground Source properties: different to Parity"] = (
        (units_with_assigned_packages["Heating Type"] == "Ground Source")
        & units_with_assigned_packages["Heating"].isin(
            [
                "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C",
                "Electric Storage Systems: Fan storage heaters",
                "Electric Storage Systems: High heat retention storage heaters",
            ]
        )
    )

    units_with_assigned_packages["LPG properties: different to Parity"] = (
        (units_with_assigned_packages["Heating Type"] == "Lpg")
        & units_with_assigned_packages["Main Fuel"].isin(
            ["Gas: Mains Gas", "Solid Fuel: Wood Logs, Gas: Mains Gas"]
        )
    )

    units_with_assigned_packages["Solid Fuel properties: different to Parity"] = (
        (units_with_assigned_packages["Heating Type"] == "Solid Fuel")
        & units_with_assigned_packages["Main Fuel"].isin(["Gas: Mains Gas"])
    )

    # ------------------------------------------------------------------
    # Packages that are not conducive to the property's features, e.g.
    # Solar PV packages for properties that have another dwelling above
    # ------------------------------------------------------------------
    # Label properties that have been matched to a package, during coordination, that includes Solar PV and has
    # a property with a dwelling above
    units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = (
        (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & (
            units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above")
        )
    )

    # NOTE(review): the original intent here was to label properties that have a dwelling above in the
    # Parity data, and weren't surveyed, but have a Solar PV package. The expression is currently identical
    # to the one above (same column, same survey-based condition), so it adds no new information.
    # Confirm the intended Parity column/value before changing it.
    units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = (
        (units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])) & (
            units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above")
        )
    )

    # ------------------------------------------------------------------
    # Anomalies by postcode, based on the Parity data and the survey data
    # ------------------------------------------------------------------
    fields_to_check = [
        'Wall Type Category',
        # 'Roof Type Category', - not very interesting
        'Heating',
        'Main Fuel',
        'Survey: Main Wall Type',
        # 'Survey: Main Roof Type',
        'Survey: Primary Heating System'
    ]

    # Wall type category: the wall type with any parenthesised qualifier removed
    units_with_assigned_packages['Wall Type Category'] = units_with_assigned_packages['Wall Type'].str.replace(
        r'\s*\(.*?\)', '', regex=True
    )

    # Create roof type category by splitting on colon and taking the first part
    units_with_assigned_packages['Roof Type Category'] = units_with_assigned_packages['Roof Type'].str.split(':').str[0]

    units_with_assigned_packages["Street, Region and Postcode"] = (
        units_with_assigned_packages["Street and Region"] + ", " + units_with_assigned_packages["Postcode"]
    )

    def check_mixed_types(row):
        """Return True when the row has non-zero counts for more than one primary type.

        The primary type is the part of a column name before the first ':'. Columns without a ':'
        (the grouping key and the summary columns) are ignored. Iterating the row itself, rather
        than an outer DataFrame, keeps this helper independent of the enclosing loop variable.
        """
        primary_types_present = {
            col.split(':')[0]
            for col, count in row.items()
            if ':' in col and count > 0  # Non-zero count means this type is present
        }
        return len(primary_types_present) > 1

    aggregated_results = {}
    for field in fields_to_check:
        # Group by postcode and count occurrences of each unique value
        field_counts = (
            units_with_assigned_packages.groupby(['Street, Region and Postcode', field])
            .size()
            .unstack(fill_value=0)
            .reset_index()
        )

        # Calculate dominant value and percentage before modifying the DataFrame
        # (column 0 is the grouping key, the remaining columns are the per-value counts)
        value_counts = field_counts.iloc[:, 1:]
        dominant_value = value_counts.idxmax(axis=1)
        number_of_properties = value_counts.sum(axis=1)
        dominant_percentage = (value_counts.max(axis=1) / number_of_properties) * 100

        # Add these as new columns after computation
        field_counts['Dominant Value'] = dominant_value
        field_counts['% Dominant'] = dominant_percentage
        field_counts['Number of Properties'] = number_of_properties
        field_counts['Mixed Type'] = field_counts.apply(check_mixed_types, axis=1)

        # Store the result in the dictionary
        aggregated_results[field] = field_counts

    # ------------------------------------------------------------------
    # EPC data
    # ------------------------------------------------------------------
    def read_epc_data():
        """Read both stored EPC batches from S3 and return them as one DataFrame."""
        epc_data = json.loads(
            read_from_s3(
                bucket_name="retrofit-data-dev",
                s3_file_name="customers/Stonewater/clustering/epc_data.json"
            )
        )
        epc_data = pd.DataFrame(epc_data)

        # Manual correction of the UPRN for internal_id 1091
        epc_data["uprn"] = np.where(
            epc_data["internal_id"] == 1091,
            83143766,
            epc_data["uprn"]
        )
        # NOTE(review): this loads a pickle; only safe while the bucket contents are trusted.
        epc_data_batch_2 = read_pickle_from_s3(
            s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
            bucket_name="retrofit-data-dev"
        )
        epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)

        complete_epcs = pd.concat([epc_data, epc_data_batch_2])

        return complete_epcs

    epc_data = read_epc_data()
    # Get just the fields we want from the EPC: Uprn, Wall, Roof, Heating, Fuel, SAP Score, EPC Band, Date of EPC
    epc_data_to_append = epc_data[
        [
            "uprn", "walls-description", "roof-description", "mainheat-description", "main-fuel",
            "current-energy-efficiency", "current-energy-rating", "lodgement-date",
            "estimated"
        ]
    ].rename(
        columns={
            "uprn": "UPRN",
            "walls-description": "EPC: Wall Type",
            "roof-description": "EPC: Roof Type",
            "mainheat-description": "EPC: Heating",
            # Key must match the selected column ("main-fuel"); the previous key "mainfuel" was
            # silently ignored by rename, leaving the column un-prefixed in the output.
            "main-fuel": "EPC: Main Fuel",
            "current-energy-efficiency": "EPC: SAP Score",
            "current-energy-rating": "EPC: EPC Band",
            "lodgement-date": "EPC: Date of EPC",
            "estimated": "EPC Estimated based on Nearby Properties"
        }
    )
    # Take the newest EPC per UPRN, based on lodgement date
    epc_data_to_append = epc_data_to_append.sort_values("EPC: Date of EPC", ascending=False).drop_duplicates("UPRN")

    epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"])
    # Years since the EPC was lodged
    epc_data_to_append["Years since EPC"] = (pd.Timestamp.now() - epc_data_to_append["EPC: Date of EPC"]).dt.days / 365
    epc_data_to_append = epc_data_to_append[epc_data_to_append["UPRN"] != ""]
    epc_data_to_append["UPRN"] = epc_data_to_append["UPRN"].astype(int)

    units_with_assigned_packages = units_with_assigned_packages.merge(
        epc_data_to_append, how="left", on="UPRN",
    )

    # ------------------------------------------------------------------
    # Wave 2.1 data
    # ------------------------------------------------------------------
    wave_2_data = pd.read_excel(
        os.path.join(
            CUSTOMER_FOLDER_PATH, "Stonewater 2.1 SAP Pre & Post.xlsx"
        ),
        header=3
    )
    # Remove any where the work is outstanding
    wave_2_data = wave_2_data[wave_2_data["Retrofit Assessment"] == "Completed"]
    wave_2_data = wave_2_data[~pd.isnull(wave_2_data["Package Approved (Client)"])]
    wave_2_data["house_number"] = wave_2_data["Name"].apply(lambda x: SearchEpc.get_house_number(x, ""))

    # Filter postcodes in the units_with_assigned_packages, to find overlapping postcodes
    related_to_wave_2 = units_with_assigned_packages[
        units_with_assigned_packages["Postcode"].isin(
            wave_2_data["Post Code"].values
        ) & (
            ~units_with_assigned_packages["Confidence Tier"].isin(
                [
                    "1 - same archetype, same postal region", "1 - property was surveyed"
                ]
            )
        )
    ]

    # NOTE(review): wave2_matches is built but never merged or written out below - confirm whether it
    # should be joined on to units_with_assigned_packages before the CSV export.
    wave2_matches = []
    for _, home in related_to_wave_2.iterrows():
        # Get the related homes
        assigned_wave_2_packages = wave_2_data[
            wave_2_data["Post Code"] == home["Postcode"]
        ]

        if assigned_wave_2_packages.shape[0] != 1:
            # In this case, we get the closest match based on door number
            hn = SearchEpc.get_house_number(home["Name"], home["Postcode"])

            door_gap = abs(assigned_wave_2_packages["house_number"].astype(int) - int(hn))
            assigned_wave_2_packages = assigned_wave_2_packages[door_gap == min(door_gap)]

        wave2_matches.append(
            {
                "UPRN": home["UPRN"],
                "2.1 matched address": assigned_wave_2_packages["Name"].values[0],
                "2.1 matched address: Package Ref": assigned_wave_2_packages["Package Approved (Client)"].values[0],
                "2.1 matched address: Wall Insulation": assigned_wave_2_packages["Wall Insulation"].values[0],
                "2.1 matched address: Loft Insulation": assigned_wave_2_packages["Loft Insulation"].values[0],
                "2.1 matched address: Ventilation": assigned_wave_2_packages["Ventilation"].values[0],
                "2.1 matched address: Windows": assigned_wave_2_packages["Windwos Upgrade"].values[0]
            }
        )

    # Store each result to CSV
    for field, df in aggregated_results.items():
        df.to_csv(
            os.path.join(CUSTOMER_FOLDER_PATH, f"{field} - aggregated results.csv"), index=False
        )

    # Store units_with_assigned_packages
    units_with_assigned_packages.to_csv(
        os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False
    )
def extract_sharepoint_url(x):
    """
    Return the last two path segments of the SharePoint URL embedded in a link cell.

    The cell is expected to look like ``"<label> - http(s)://host/path/to/Parent/Folder"``; the
    text after the ``" - http"`` marker is parsed as a URL and the final two path segments are
    returned joined by ``"/"`` (e.g. ``"Parent/Folder"``).

    Each segment is percent-decoded in full (not just ``%20``), so characters such as brackets or
    ampersands in folder names come back as written. Decoding happens after splitting on ``"/"``
    so that an encoded slash cannot change the segment boundaries.

    :param x: Cell value containing the link, or a null/NaN value.
    :return: ``"Parent/Folder"`` style string, or ``""`` when ``x`` is null.
    :raises IndexError: if ``x`` does not contain the ``" - http"`` marker.
    """
    if pd.isnull(x):
        return ""

    # Everything after the marker still parses correctly: only the path component is used.
    url_path = parse.urlparse(x.split(" - http")[1]).path
    return "/".join(parse.unquote(segment) for segment in url_path.split("/")[-2:])
contents + for i in range(1, NUM_FOLDERS + 1): + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}") + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + wave_21_folders = [ + "1. Herefordshire", + "2. Bedfordshire", + "3. Wiltshire", + "4. Bournemouth", + "5. Coventry", + "6. West Sussex", + "7. Dorset", + "8. Cambridgeshire", + "9. Guildford", + "10. Little Island", + "11. CCS Dorset" + ] + + for wave_2_1_folder in wave_21_folders: + folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder) + if os.path.isdir(folder_path): # Check if folder exists + folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in + os.listdir(folder_path)] + survey_folders.extend(folder_contents) # Append contents to the master list + + # We now do a large pull of all of the data + extracted_data = [] + mtp_extracted_data = [] # Additional data to extract from the medium term plans + for survey_folder in tqdm(survey_folders): + survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder) + + # Check that the survey folder is actually a folder + if not os.path.isdir(survey_folder_path): + continue + + # List the folders inside of the survey folder + survey_subfolders = [ + name for name in os.listdir(survey_folder_path) + if os.path.isdir(os.path.join(survey_folder_path, name)) + ] + + # Check if there's a "retrofit assessment" folder + retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None) + + ra_folder = next( + (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()), + None + ) + + mtp_folder = next( + (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()), + None + ) + 
if mtp_folder: + # We have a mid term plan: + mtp_folder_path = os.path.join(survey_folder_path, mtp_folder) + # Get the contents - files and not folder + mtp_contents = [ + os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path) + if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file)) + ] + + has_v1 = [ + f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower() + ] + + if has_v1: + # Then we go one level deeper + mtp_contents = [ + os.path.join(has_v1[0], f) for f in + os.listdir(os.path.join(survey_folder_path, has_v1[0])) + ] + + # We check the the IMA + for file_name in mtp_contents: + + filepath = os.path.join(survey_folder_path, file_name) + # We expect a pdf so try and parse it + try: + with open(filepath, "rb") as file: + reader = PyPDF2.PdfReader(file) + # Just the first page + text = reader.pages[0].extract_text() + + except Exception as e: + continue + + # We check if this is an IMA + ima_heading_search = re.search( + r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text + ) + + is_ima = bool(ima_heading_search) + if not is_ima: + continue + + # Otherwise, extract: RIR, PV + pv_search = re.search(r"PV \(\d+Kwp\)", text) + has_pv = bool(pv_search) + pv_system = pv_search.group(0) if has_pv else None + + # We perform a second search for PV: + if pv_search is None: + pv_search = re.search("solar pv", text.lower()) + has_pv = bool(pv_search) + pv_system = "Solar PV" if has_pv else None + + rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text) + has_rir = bool(rir_search) + rir_spec = rir_search.group(0) if has_rir else None + + mtp_extracted_data.append({ + "survey_folder": survey_folder, + "has_pv": has_pv, + "PV System": pv_system, + "RIR Specification": rir_spec, + "has_rir": has_rir + }) + continue + + # If retrofit assessment folder exists, check if it has content + if retrofit_folder or ra_folder: + if retrofit_folder: + retrofit_folder_path 
= os.path.join(survey_folder_path, retrofit_folder) + else: + retrofit_folder_path = os.path.join(survey_folder_path, ra_folder) + + # Check if everything inside is a sub-folder and the number of folders is 2 + items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store'] + all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items] + if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items: + # Get the folder that isn't Property Pics + retrofit_folder_path = os.path.join( + retrofit_folder_path, [item for item in items if item != "Property Pics"][0] + ) + + if os.listdir(retrofit_folder_path): # If not empty + summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path) + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + continue + else: + # Then we have an empty Retrofit Assessment folder + continue + + # If no retrofit folder or it was empty, check files in survey_folder + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + if not summary_data: + if len(survey_subfolders) == 1: + survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0]) + summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path) + + if summary_data: + summary_data = { + "survey_folder": survey_folder, + **summary_data, + } + extracted_data.append(summary_data) + + retrofit_assessment_data = pd.DataFrame(extracted_data) + mtp_df = pd.DataFrame(mtp_extracted_data) + + # Save + # retrofit_assessment_data.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False + # ) + # mtp_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False + # ) + retrofit_assessment_data = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), + ) 
+ mtp_df = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), + ) + + # There are a few duplicates we just manually drop + mtp_df = mtp_df.drop_duplicates() + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27" + ) & (~mtp_df["has_pv"])) + ] + + mtp_df = mtp_df[ + ~(( + mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5" + ) & (~mtp_df["has_pv"])) + ] + + # Remove some definite duplicates + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"] + dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)] + dupes = dupes.sort_values("Address") + # Get all of the folders that end with ROSS + to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist() + + # Replace \n with "" + retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "") + + retrofit_assessment_data = retrofit_assessment_data[ + ~retrofit_assessment_data["survey_folder"].isin( + [ + "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS", + "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS", + "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS" + ] + to_drop + ) + ] + + retrofit_assessments_data_columns = [ + 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)', + 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys', + 'Fuel Bill', 'Window Age Description', + 'Window Age Description Proportion (%)', + 'Secondary Window Age Description', + 'Secondary Window Age Description Proportion (%)', 'Number of Windows', + 'Total Number of Doors', 'Number of Insulated Doors', + 'Existing Primary Heating System', + 'Existing Primary Heating PCDF Reference', + 'Existing Primary Heating Controls', + 'Existing Primary Heating % of Heat', + 'Existing Secondary Heating System', + 'Existing Secondary Heating 
PCDF Reference', + 'Existing Secondary Heating Controls', + 'Existing Secondary Heating % of Heat', 'Secondary Heating Code', + 'Water Heating Code', 'Total Floor Area (m2)', + 'Total Ground Floor Area (m2)', 'RIR Floor Area', + 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)', + 'Number of Light Fittings', 'Number of LEL Fittings', + 'Number of fittings needing LEL', 'Main Roof Type', + 'Main Roof Insulation', 'Main Roof Insulation Thickness', + 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining', + 'Main Wall Thickness', 'Main Building Alternative Wall Type', + 'Main Building Alternative Wall Insulation', + 'Main Building Alternative Wall Dry-lining', + 'Main Building Alternative Wall Thickness', + 'Main Fuel', + 'Main Building Age Band', + ] + # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey: + retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns] + rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed)) + retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict) + retrofit_assessment_data["Survey: Current EPC Band"] = ( + retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x)) + ) + + # We can read in the data as needed + + # Next Step: Read in the coordinated measures and match to the extracted data + ############################################################ + # CCS + ############################################################# + ccs_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Jan 2025 Project", + "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx" + ), + header=4 + ) + ccs_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx" + ), + header=4 + ) + ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 
'Asset ID.1']].merge( + ccs_coordination_sheet, how="left", on="Name" + ) + ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])] + ccs_coordination_sheet["contractor"] = "CCS" + # We split ccs into two sections - the first being + ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21) + ccs_coordination_sheet = ccs_coordination_sheet.head(87) + ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet]) + + ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x)) + + ############################################################ + # WATES + ############################################################# + wates_coordination_sheet = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx" + ), + header=4 + ) + wates_postcodes = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx" + ), + header=4 + ) + wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])] + wates_coordination_sheet = wates_coordination_sheet.merge( + wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name" + ) + + wates_coordination_sheet["contractor"] = "Wates" + # Break into the different sites: + # Wiltshire + wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267) + wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :] + wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :] + wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :] + wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :] + wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :] + wates_coordination_sheet_removed_from_programme = 
wates_coordination_sheet.iloc[586:926, :] + wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :] + + wates_coordination = pd.concat( + [ + wates_coordination_sheet_wiltshere, + wates_coordination_sheet_herefordshire, + wates_coordination_sheet_coventry, + wates_coordination_sheet_bedfordshire, + wates_coordination_sheet_bournemouth, + wates_coordination_sheet_cambridgeshire, + wates_coordination_sheet_removed_from_programme, + wates_coordination_sheet_abeyance + ] + ) + # We correct the Asset ID for 34 Kempster Close + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == "34 Kempster Close", + "12005", + wates_coordination["Asset ID"] + ) + + # We fill the missing ids + missing_lookup = { + "4 Sydnall Fields": 31231, + "12 Sydnall Fields": 31239, + "12 Athena Gardens": 28061, + "49 Banner Lane": 41189, + "4 Jonathan Road": 41232, + "8 Jonathan Road": 41236, + "1 Jonathan Road": 41229, + "96 Taunton Way": 31417, + "94 Taunton Way": 31418, + "1 Lady Lane": 29430, + "10 Jonathan Road": 41283, + "21 Jonathan Road": 41246, + "12 Ashcroft Close": 26399 + } + for name, asset_id in missing_lookup.items(): + wates_coordination["Asset ID"] = np.where( + wates_coordination["Name"] == name, + asset_id, + wates_coordination["Asset ID"] + ) + + wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])] + + wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply( + lambda x: extract_sharepoint_url(x) + ) + + ############################################################ + # NEW 450 COORDINATED RETROFIT ASSESSMENTS + ############################################################# + features = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + features["Address ID"] = features["Address ID"].astype(str).astype(int) + features_to_merge = features[["Address ID", 
"Organisation Reference"]] + + retrofit_packages_board = pd.read_excel( + os.path.join( + CUSTOMER_FOLDER_PATH, + "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx" + ), + header=4 + ) + retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])] + # Take just the rows that have been surveyed + retrofit_packages_board = retrofit_packages_board[ + retrofit_packages_board["RA"].isin(["Invoiced", "Completed"]) + ] + + retrofit_packages_board = retrofit_packages_board.merge( + features_to_merge, how="left", on="Address ID" + ) + + manual_filters = { + "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD", + "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG", + "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ", + 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT", + '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT', + '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY', + 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN', + 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. 
Leonards Avenue-MK42 0RB', + '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS', + '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY', + '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW', + '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS', + '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX', + '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX', + '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA', + '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ', + '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG", + '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX', + "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX', + '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX', + '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ', + '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX', + '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA' + } + + # We now match this retrofit packages board to the extracted data + matching_lookup = [] + for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is 
contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".", + "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False + ) + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + continue + + # home["Name"] should be contained in the survey_folder + filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)] + # We have an edge case wher some properties have two outputs in Sharepoint + if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ": + raise Exception("Fix me1") + # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"] + + if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB': + raise Exception("Fix me2") + # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"] + + if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ': + filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"] + + if filtered.empty: + continue + if filtered.shape[0] != 1: + raise Exception("something went wrong") + + matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Address ID": home["Address ID"], + "Name": home["Name"] + } + ) + matching_lookup = pd.DataFrame(matching_lookup) + + ccs_coordination = ccs_coordination.rename( + columns={"Post Code": "Postcode"} + ) + ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])] + 
ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"] + + ccs_manual_filters = { + "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35" + } + ccs_matching_lookup = [] + for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)): + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in ccs_manual_filters: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]] + ].copy() + else: + filtered = retrofit_assessment_data[ + retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower() + ].copy() + + # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces + to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains( + home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False + ) + if to_filter.sum() == 0: + to_filter = ( + filtered["survey_folder"]. + str.replace(r"[^\w\s]", ""). + str.replace(",", ""). + str.replace(".", ""). 
def reformat_survey_folder(x):
    """Rotate the last space-separated token of a survey folder name to the front.

    Some survey folders are named street-first (e.g. ``".../Colville Road 7"``)
    while the coordination sheet names the property number-first
    (``"7 Colville Road"``). Only the final ``/``-separated path segment is
    considered; everything before it is discarded.

    :param x: Survey folder path, with ``/`` as the separator.
    :return: The final path segment with its trailing token moved to the front.
    """
    # Keep only the leaf folder name.
    leaf = x.rsplit("/", 1)[-1]
    # str.split(" ") always yields at least one element, so this unpack is safe.
    *leading, trailing = leaf.split(" ")
    return " ".join([trailing, *leading])
wates_coordination[ + wates_coordination["Retrofit Assessment"].isin(["Completed"]) + ] + wates_coordination = wates_coordination[ + ~pd.isnull(wates_coordination["Postcode"]) + ] + + wates_manual_filters = { + "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View", + "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft", + "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View", + 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13', + "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4", + '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1', + '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2', + } + wates_matching_lookup = [] + # Examples to skip when we cannot get the data + wates_to_skip = [ + "66 Abbatt Close", # File type is unusual, couldn't extract the data + "Flat 69 Goddard Road", # Doesn't exist + "19 Garth House", # # File type is unusual, couldn't extract the data + '5 Gilpin Close', # No properly formatted EPR + '49 The Hide, Netherfield', # TODO: TEMP HERE + '19 Chanders Rd', + '5 Chanders Rd', + '23 Chanders Rd', + '3 Chanders Rd', + '1 Orchard Close', + ] + wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)] + + for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)): + + # Search the folder + filtered = retrofit_assessment_data[ + retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False) + ] + if len(filtered) == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + if home["Name"] in wates_to_skip: + continue + + # Handle the case that has the wrong postcode in the asset data + if home["Name"] in wates_manual_filters: + filtered = 
def reformat_survey_folder(x):
    """Rotate the last space-separated token of a survey folder name to the front.

    Some survey folders are named street-first (e.g. ``".../Colville Road 7"``)
    while the coordination sheet names the property number-first
    (``"7 Colville Road"``). Only the final ``/``-separated path segment is
    considered; everything before it is discarded.

    :param x: Survey folder path, with ``/`` as the separator.
    :return: The final path segment with its trailing token moved to the front.
    """
    # Keep only the leaf folder name.
    leaf = x.rsplit("/", 1)[-1]
    # str.split(" ") always yields at least one element, so this unpack is safe.
    *leading, trailing = leaf.split(" ")
    return " ".join([trailing, *leading])
filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() == + home["Name"].lower() + ) + + if to_filter.sum() == 0: + raise Exception("Error") + filtered = filtered[to_filter] + + if filtered.empty: + continue + + if filtered.shape[0] == 1: + wates_matching_lookup.append( + { + "survey_folder": filtered["survey_folder"].values[0], + "Asset ID": home["Asset ID"], + "Name": home["Name"] + } + ) + continue + + raise Exception("No match") + wates_matching_lookup = pd.DataFrame(wates_matching_lookup) + + # We get a match for all records + assert wates_matching_lookup.shape[0] == wates_coordination.shape[0] + assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum() + assert not wates_matching_lookup["Asset ID"].duplicated().sum() + + # Merge lookup tables onto the coordination sheets + wates_coordination = wates_coordination.merge( + wates_matching_lookup, how="left", on="Name" + ) + missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])] + if not missed_asset_id.empty: + raise Exception("Missing Asset ID") + + if wates_coordination["Asset ID_x"].duplicated().sum(): + raise Exception("Duplicated IDs in wates") + + # We merge the mpt data on to the wates coordination + wates_coordination = wates_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + + ccs_coordination = ccs_coordination.merge( + ccs_matching_lookup, how="left", on="Name" + ) + ccs_coordination = ccs_coordination.merge( + mtp_df, how="left", on="survey_folder" + ) + + retrofit_packages_board = retrofit_packages_board.merge( + matching_lookup, how="left", on="Name" + ) + + # We now map the retrofit assessment data to the coordinated packages + wates_coordination = wates_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + ccs_coordination = ccs_coordination.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + retrofit_packages_board = 
retrofit_packages_board.merge( + retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder" + ) + + # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board + to_remove = wates_coordination[ + wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + assert to_remove.shape[0] == 4 + # Remove them from the wates board + wates_coordination = wates_coordination[ + ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"]) + ] + + # We combine this into a singular board + coordinated_packages = pd.concat( + [ + retrofit_packages_board[ + [ + "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating', + 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', 'Organisation Reference', + ] + retrofit_assessments_data_columns_prefixed + ], + ccs_coordination[ + [ + # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls, + # Solar PV + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. 
Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'PV System', + "Asset ID.1_y", + ] + retrofit_assessments_data_columns_prefixed + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. Doors Upgrade': 'Door Upgrade', + 'Heating': 'Main Heating', + 'Other Measures': 'Other measures', + 'Asset ID.1_y': 'Organisation Reference', + "PV System": "Solar PV", + } + ), + wates_coordination[ + [ + "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package', + 'SAP Band Install Package', 'Package Approved (Client)', + 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade', + 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System" + ] + retrofit_assessments_data_columns_prefixed + ].rename( + columns={ + "SAP Band Pre": "Actual SAP Band", + "SAP Rating Pre": "Actual SAP Rating", + 'SAP Rating Install Package': 'Modelled SAP Band', + 'SAP Band Install Package': 'Modelled SAP Rating', + 'Package Approved (Client)': 'Package Ref', + 'Wall Insulation': 'Main Wall Insulation', + 'Loft Insulation': 'Loft insulation', + 'Windows Upgrade': 'Window Upgrade', + 'Ext. 
def find_nearest_matching_property(coordinated_packages, home):
    """Find the surveyed properties that most closely resemble ``home``.

    Progressively looser sets of attribute columns are tried in order. At each
    level a row matches only when every listed column equals the corresponding
    value in ``home``; the first level that yields at least one row wins. If no
    level matches, rows sharing ``home``'s ``"Archetype ID"`` are used instead.

    Equality is evaluated with ``==``, so a missing (NaN) value in ``home`` or
    in a row never matches.

    :param coordinated_packages: DataFrame containing every column named in
        the filter levels below, plus ``"Archetype ID"``.
    :param home: Mapping or Series exposing the same keys for the property
        being matched.
    :return: ``(matching_rows, confidence)``. ``confidence`` is the number
        paired with the level that matched (2 for the tightest level up to 7
        for the loosest), or one more than the largest level value for the
        archetype fallback. ``(None, None)`` when nothing matches.
    """
    filter_levels = [
        (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
        (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
        (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
        (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
        (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
        (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
    ]

    max_confidence = max(confidence for _, confidence in filter_levels)

    for filters, match_confidence in filter_levels:
        # Boolean indexing already returns a new frame, so there is no need to
        # copy the whole table up front for every level.
        match = coordinated_packages
        for col in filters:
            match = match[match[col] == home[col]]

        if not match.empty:
            return match, match_confidence

    # Finally, we search for a property in the same Archetype
    match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
    if not match.empty:
        return match, max_confidence + 1

    return None, None  # No match found
closest_match["Organisation Reference"].values + ] + matches.extend(to_extend) + + no_match_summary = new_priority_postcodes[ + new_priority_postcodes["Organisation Reference"].isin( + no_match + ) + ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[ + "Organisation Reference"].count().reset_index() + + no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False) + + # len(no_match) + # 8764, 5607, 5646, 5071 + # no_match_summary.shape + # (3953, 6), (2948, 6), (2969, 7), (2575, 7) + + matches_df = pd.DataFrame(matches) + + matches_df = matches_df.merge( + coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]], + left_on="Best Match Organisation Reference", right_on="Organisation Reference", + suffixes=("", " - Closest Match") + ) + + measures_columns = [ + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures' + ] + + # We want to aggregate the matches, when we have multiple + aggregated_matches_df = [] + for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"): + + measures = coordinated_packages[ + ( + coordinated_packages["Organisation Reference"].isin( + mapped_matches['Best Match Organisation Reference'].values + ) + ) + ][measures_columns] + + if mapped_matches.shape[0] == 1: + # Get the measures for this property + measures = measures.squeeze() + + aggregated_matches_df.append( + { + "Organisation Reference": org_ref, + "Number of matches": 1, + "Proportion": 100, + "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0], + "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0], + "Was Surveyed": mapped_matches["Was Surveyed"].values[0], + **measures + } + ) + continue + + # We need to aggregate the 
def remove_leading_zero(address):
    """Strip the padding zero from a zero-padded single-digit house number.

    ``"07 Mason Road"`` becomes ``"7 Mason Road"``. Only a ``0`` at the very
    start of the string that is immediately followed by one digit 1-9 and a
    space is rewritten; anything else (``"10 ..."``, ``"007 ..."``,
    ``"Flat 01 ..."``) is returned as-is.

    :param address: First line of an address. Non-string values (for example
        the NaN that pandas string accessors yield for a missing address) are
        returned unchanged instead of raising ``TypeError`` from ``re.sub``.
    :return: The address with the leading zero removed where applicable.
    """
    if not isinstance(address, str):
        return address
    return re.sub(r"^0([1-9]) ", r"\1 ", address)
) + mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"] + + # Flag where 2 out of the three columns have consensus + mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = ( + (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) | + (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) | + (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"]) + ) + + # Let's get the newest EPC data for these properties + # We merge on UPRN, when we have it + # from etl.route_march_data_pull.app import get_data + # epc_data, errors, nodata = get_data( + # asset_list=mapped_priority_list, + # fulladdress_column="Address", + # address1_column="address1", + # postcode_column="Postcode", + # manual_uprn_map={}, + # epc_api_only=True + # ) + # + # epc_df = pd.DataFrame(epc_data) + # epc_df.to_csv( + # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False + # ) + epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv")) + epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"}) + + # We now package up the data + + # Sheet 1 is the base coordination data + output_coordination_sheet = coordinated_packages[ + [ + "Name", "Postcode", 'Organisation Reference', 'Package Ref', + 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation', + 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade', + 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls', + 'Solar PV', 'Other measures', + 'Survey: Current SAP Rating', + 'Survey: Current EPC Band', + 'Survey: Primary Energy Use (kWh/yr)', + 'Survey: Primary Energy Use Intensity (kWh/m2/yr)', + 'Survey: Number of Storeys', 'Survey: Fuel Bill', + 'Survey: Window Age Description', + 'Survey: Window Age Description Proportion (%)', + 'Survey: Secondary Window Age Description', + 'Survey: Secondary Window Age Description 
Proportion (%)', + 'Survey: Number of Windows', 'Survey: Total Number of Doors', + 'Survey: Number of Insulated Doors', + 'Survey: Existing Primary Heating System', + 'Survey: Existing Primary Heating PCDF Reference', + 'Survey: Existing Primary Heating Controls', + 'Survey: Existing Primary Heating % of Heat', + 'Survey: Existing Secondary Heating System', + 'Survey: Existing Secondary Heating PCDF Reference', + 'Survey: Existing Secondary Heating Controls', + 'Survey: Existing Secondary Heating % of Heat', + 'Survey: Secondary Heating Code', 'Survey: Water Heating Code', + 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)', + 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)', + 'Survey: First Extension Wall Area (m2)', + 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings', + 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type', + 'Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type', + 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining', + 'Survey: Main Wall Thickness', + 'Survey: Main Building Alternative Wall Type', + 'Survey: Main Building Alternative Wall Insulation', + 'Survey: Main Building Alternative Wall Dry-lining', + 'Survey: Main Building Alternative Wall Thickness', + 'Survey: Main Fuel', + 'Survey: Main Building Age Band', + 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type' + ] + ].rename( + columns={ + 'Walls': "Parity - Walls", + 'Roofs': "Parity - Roof", + 'Heating': "Parity - Heating", + 'Main Fuel': "Parity - Fuel", + 'Age': "Parity - Age Band", + 'Property Type': "Parity - Property Type" + } + ) + + # Sheet 2 is the lookup table which maps the properties to their closest match + # We need to bring in the parity attributes between the mapped properties so we can see side-by-side + mapped_lookup = matches_df[ + [ + 'Organisation Reference', + 'Best Match Organisation Reference', + 'Survey: Current EPC Band', 
+ 'Survey: Current SAP Rating', + "Was Surveyed", + "match_confidence", + ] + ].rename( + columns={ + 'Best Match Organisation Reference': "Best Match - Organisation Reference", + "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band", + 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating" + } + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]], + how="left", + on="Organisation Reference" + ).merge( + features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type", + "Total Floor Area"]].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + "Walls": "Best Match - Walls", + "Roofs": "Best Match - Roof", + "Heating": "Best Match - Heating", + "Main Fuel": "Best Match - Main Fuel", + "Age": "Best Match - Age", + "Property Type": "Best Match - Property Type", + "Total Floor Area": "Best Match - Total Floor Area" + } + ), + how="left", + on="Best Match - Organisation Reference" + ).merge( + coordinated_packages[ + [ + "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation', + 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness', + 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band', + 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)', + 'Survey: Main Building Age Band', + ] + ].rename( + columns={ + "Organisation Reference": "Best Match - Organisation Reference", + 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type', + 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation', + 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type', + 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation', + 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness', + 'Survey: Existing 
Primary Heating System': 'Best Match - Survey: Existing Primary Heating System', + } + ), + how="left", + on="Best Match - Organisation Reference" + ) + + # Finally, we have the property, against the mapped home with the estimate SAP scores and the EPC data + worksheet = mapped_priority_list[ + [ + 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID', + 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing', + 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion', + 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed", + 'Main Wall Insulation', + 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof', + 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation', + 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV', + 'Other measures', "2 of 3 Data Sources Have Consensus on EPC" + ] + ].rename( + columns={ + "SAP": "Parity - SAP Rating", + "SAP Band": "Parity - EPC Rating", + "Property Type": "Parity - Property Type", + "Walls": "Parity - Walls", + "Roofs": "Parity - Roofs", + 'Glazing': "Parity - Glazing", + 'Heating': 'Parity - Heating', + 'Main Fuel': 'Parity - Main Fuel', + 'Hot Water': 'Parity - Hot Water', + 'Proportion': 'Proportion of matched properties with same EPC rating', + } + ).merge( + epc_df[ + [ + "Organisation Reference", + "uprn", + "current-energy-efficiency", + "current-energy-rating", + "lodgement-date", + "construction-age-band", + "walls-description", + "roof-description", + "mainheat-description", + "windows-description", + "hotwater-description", + "main-fuel", + "total-floor-area", + ] + ].rename( + columns={ + "uprn": "Last EPC - uprn", + "current-energy-efficiency": "Last EPC - SAP Score", + "current-energy-rating": "Last EPC - EPC Rating", + "lodgement-date": "Last EPC - Date Lodged", + "construction-age-band": "Last EPC - Age Band", + "walls-description": "Last EPC - Walls", + "roof-description": "Last EPC - Roof", + "mainheat-description": 
"Last EPC - Heating", + "windows-description": "Last EPC - Windows", + "hotwater-description": "Last EPC - Hot Water", + "main-fuel": "Last EPC - Main Fuel", + "total-floor-area": "Last EPC - Total Floor Area" + } + ), + how="left", + on='Organisation Reference' + ) + + worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime( + worksheet["Last EPC - Date Lodged"]).dt.year + + worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str) + + worksheet["uprn"] = np.where( + pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]), + worksheet["Last EPC - uprn"], + worksheet["uprn"] + ) + + worksheet["uprn"] = worksheet["uprn"].replace("", "") + + worksheet = worksheet.drop(columns=["Last EPC - uprn"]) + + # Save to Excel with multiple sheets + excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx") + with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer: + worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True) + mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True) + output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True) + +# if __name__ == "__main__": +# main() diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py new file mode 100644 index 00000000..eedae9b9 --- /dev/null +++ b/etl/customers/stonewater/data_cleaning.py @@ -0,0 +1,155 @@ +import os +import shutil +from tqdm import tqdm +from etl.access_reporting.app import SharePointClient + + +def delete_large_files(): + """ + This function deletes photos, designs and other files which we don't need + :return: + """ + + folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys" + + # List the contents of this folder since in each sub-folder we have the property folders + contents = os.listdir(folder_path) + + for 
def delete_large_files(
        folder_path="/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys"
):
    """
    Delete photos, designs and other bulky files which we don't need from the downloaded survey folders.

    The folder is walked as ``folder_path/<sub-folder>/<property folder>/...``. Anything at the first two
    levels that is not a directory is left untouched.

    :param folder_path: Root folder to clean. Defaults to the location the script was originally written for.
    :return: None
    """
    # Folders removed wholesale from every property folder. The same folder appears under several
    # numberings/spellings in the data, so every variant is listed explicitly.
    folders_to_delete = (
        "1. RA Property Pics", "4. Air Tightness Tests", "5. RD Design Info",
        "1. RA Property PIcs", "Post EPC Photos", "4. RD Design Info",
        "5. Installer Info", "6. Trustmark lodgement", "7.Post Install Inspection Photos",
        "6. Trustmark Lodgement", "7. Post Inspection Photos",
    )
    # File suffixes removed from the coordinator folder (case-sensitive, exactly as before)
    unwanted_suffixes = (".MOV", ".jpg")

    for subfolder in os.listdir(folder_path):
        subfolder_path = os.path.join(folder_path, subfolder)
        if not os.path.isdir(subfolder_path):
            continue

        # Each sub-folder holds one folder per property
        for property_name in tqdm(os.listdir(subfolder_path)):
            property_path = os.path.join(subfolder_path, property_name)
            if not os.path.isdir(property_path):
                continue

            property_contents = os.listdir(property_path)

            for folder_to_delete in folders_to_delete:
                if folder_to_delete not in property_contents:
                    continue
                folder_to_delete_path = os.path.join(property_path, folder_to_delete)
                if os.path.isdir(folder_to_delete_path):
                    # Delete the folder, even if it's not empty
                    shutil.rmtree(folder_to_delete_path)

            # The coordinator folder is numbered either "2." or "1."
            if "2. RA Coordinator Info" in property_contents:
                coordinator_folder = "2. RA Coordinator Info"
            else:
                coordinator_folder = "1. RA Coordinator Info"
            coordinator_info_path = os.path.join(property_path, coordinator_folder)

            # Some property folders have neither variant. Listing a missing folder used to raise
            # FileNotFoundError and abort the whole clean-up, so skip those properties instead.
            if not os.path.isdir(coordinator_info_path):
                continue

            coordinator_info_contents = os.listdir(coordinator_info_path)

            # Remove videos and photos stored directly in the coordinator folder
            for file_name in coordinator_info_contents:
                if file_name.endswith(unwanted_suffixes):
                    os.remove(os.path.join(coordinator_info_path, file_name))

            property_pics_path = os.path.join(coordinator_info_path, "Property Pics")
            if "Property Pics" in coordinator_info_contents and os.path.isdir(property_pics_path):
                # Delete folder and contents
                shutil.rmtree(property_pics_path)
def download_data_from_sharepoint():
    """
    Download the retrofit assessment / mid-term plan folders for every Stonewater property from SharePoint.

    The SharePoint tree is walked as ``<root>/<area folder>/<property folder>/<sub-folder>``. Only the area
    folders listed in ``folders_to_keep`` are visited, and only sub-folders whose name looks like a retrofit
    assessment or MTP folder are downloaded. ``MOV`` and ``jpg`` files are excluded from the download.

    Credentials are read from the ``SHAREPOINT_CLIENT_ID``, ``SHAREPOINT_CLIENT_SECRET``,
    ``SHAREPOINT_TENANT_ID`` and ``OSMOSIS_SHAREPOINT_SITE_ID`` environment variables.

    :return: None
    """
    # Remote root that holds one folder per area. It was previously repeated at every call site.
    sharepoint_root = "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders"
    local_root = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2"

    sharepoint_client = SharePointClient(
        tenant_id=os.getenv("SHAREPOINT_TENANT_ID", None),
        client_id=os.getenv("SHAREPOINT_CLIENT_ID", None),
        client_secret=os.getenv("SHAREPOINT_CLIENT_SECRET", None),
        site_id=os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None)
    )
    drive_id = sharepoint_client.document_drive["id"]

    # Retrieve the data from Sharepoint and write to local machine
    contents = sharepoint_client.list_folder_contents(
        drive_id=drive_id,
        folder_path=sharepoint_root
    )

    folders_to_keep = [
        "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth",
        "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire",
        "9. Guildford", "10. Little Island", "11. CCS Dorset",
    ]

    # A sub-folder is downloaded when its lower-cased name contains any of these fragments
    wanted_name_fragments = ("ra coordinator info", "retrofit assessment", "ra info", "mtp", "mid-term")

    folders_to_pull = [
        folder for folder in contents["value"] if folder["name"] in folders_to_keep
    ]
    for folder_to_pull in folders_to_pull:
        # Remote paths always use "/" whatever the local OS is, so they are joined explicitly
        # rather than with os.path.join (which would use a backslash on Windows).
        area_path = "/".join([sharepoint_root, folder_to_pull["name"]])

        # Get the contents
        folder_contents = sharepoint_client.list_folder_contents(
            drive_id=drive_id,
            folder_path=area_path,
            page_size=100
        )

        for property_folder in folder_contents["value"]:
            # We go into each property folder and get the contents
            property_path = "/".join([area_path, property_folder["name"]])
            property_folder_contents = sharepoint_client.list_folder_contents(
                drive_id=drive_id,
                folder_path=property_path
            )
            if not property_folder_contents.get("value"):
                continue

            # We look for the retrofit assessment folder or mtp folders:
            property_sub_folders = [
                f for f in property_folder_contents["value"]
                if any(fragment in f["name"].lower() for fragment in wanted_name_fragments)
            ]

            for property_sub_folder in property_sub_folders:
                # If we have this, we download the folder and store it locally, mirroring the remote layout
                download_dir = os.path.join(
                    local_root,
                    folder_to_pull["name"],
                    property_folder["name"],
                    property_sub_folder["name"]
                )

                sharepoint_client.download_sharepoint_folder(
                    drive_id=drive_id,
                    folder_path="/".join([property_path, property_sub_folder["name"]]),
                    download_dir=download_dir,
                    excluded_file_types=["MOV", "jpg"]
                )
def get_data(asset_list):
    """
    Look up the newest EPC, and its recommendations, for every row of ``asset_list``.

    :param asset_list: DataFrame with "Postcode", "Number", "Full Address" and "row_id" columns.
    :return: Tuple ``(epc_data, errors)``. ``epc_data`` is a list of dicts holding the row id, the fields of
        the newest EPC and a "recommendations" list. ``errors`` is the list of row ids whose lookup raised.
        Rows for which no EPC was found appear in neither list.
    """
    epc_data = []
    errors = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            searcher = SearchEpc(
                address1=str(home["Number"]),
                postcode=home["Postcode"],
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=home["Full Address"],
                max_retries=5
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)
            if searcher.newest_epc is None:
                continue

            # Look for EPC recommendations. Not every certificate has them, so a failure here is
            # best-effort. Catch Exception rather than using a bare except, so that Ctrl-C and
            # SystemExit are not swallowed.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                property_recommendations = {"rows": []}

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc,
                    "recommendations": property_recommendations["rows"]
                }
            )
        except Exception:
            # Record the failing row and pause before the next request
            errors.append(home["row_id"])
            time.sleep(5)

    return epc_data, errors
def app():
    """
    Create the list of cavity properties to review for non-intrusive surveys.

    Steps:
      1. Decrypt and read the ECO rolling master, dropping cancelled installs.
      2. Flag archetyped properties that are as-built cavity, or whose archetype was found by survey to
         need cavity wall insulation (CWI), and record why each property is included.
      3. Add the as-built cavity properties from the Parity master sheet that were not archetyped.
      4. Flag ECO3 installs and properties sharing a postcode with surveyed CWI findings.
      5. Keep postal districts with more than 100 candidate properties and write them to CSV.

    The spreadsheet password is read from the ``STONEWATER_MASTER_PASSWORD`` environment variable.

    :raises ValueError: if the password variable is not set, or if an ECO3 install reaches the final list.
    :return: None
    """
    customer_dir = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"

    # Read in the password protected master
    # TODO: This file should be deleted!

    # Path to the password-protected Excel file
    file_path = ("/Users/khalimconn-kowlessar/Downloads/STONEWATER MASTER SHEET - UPDATED 20.5.24 - K- PASSWORD "
                 "PROTECTED.xlsx")
    # The password must not live in source control; it is supplied through the environment (or the .env file)
    password = os.getenv("STONEWATER_MASTER_PASSWORD")
    if not password:
        raise ValueError("STONEWATER_MASTER_PASSWORD must be set to decrypt the ECO rolling master")

    # Open the file and decrypt it
    with open(file_path, "rb") as f:
        decrypted_file = BytesIO()
        office_file = msoffcrypto.OfficeFile(f)
        office_file.load_key(password=password)
        office_file.decrypt(decrypted_file)
    decrypted_file.seek(0)

    # Read the decrypted file into a DataFrame
    eco_rolling_master = pd.read_excel(decrypted_file, sheet_name="Sheet1", engine="openpyxl")

    # na=False: rows without a text value are not cancelled. Without it the mask holds NaN and `~` fails.
    eco_rolling_master = eco_rolling_master[
        ~eco_rolling_master['INSTALL/CANCELLATION DATE'].str.contains("CANCELLED", na=False)
    ]

    archetyped_properties = pd.read_excel(
        customer_dir + "/Stonewater SHDF_3_0_Board Triage 22.05.24 - "
        "Archetyped V3.1.xlsx",
        header=4
    )

    cavity_descriptions = [
        "Cavity: AsBuilt (1983-1995)",
        "Cavity: AsBuilt (Post 1995)",
        "Cavity: AsBuilt (Pre 1976)",
        "Cavity: AsBuilt (1976-1982)",
    ]

    archetyped_properties["Is Cavity Property"] = archetyped_properties["Wall Type"].isin(cavity_descriptions)
    # We also identify any properties where properties were found to need cavity wall insulation

    costed_packages = pd.read_excel(
        customer_dir + "/Stonewater - Costed Retrofit Packages "
        "20241030 (WIP) Single Model V2.xlsx",
        sheet_name="Modelled Packages",
        header=13
    )

    needs_cwi = costed_packages[
        costed_packages["Main Wall Insulation"].isin(
            [
                "Poss Extract CWI & Refill (issues identified)",
                "CWI RdSAP Default"
            ]
        )
    ][["Address ID", "Address", "Current SAP Rating", "Current EPC Band", "Postcode", "Archetype ID",
       "Main Wall Insulation",
       "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]]

    # We flag these properties
    archetyped_properties["Survey shows CWI needed for Archetype"] = archetyped_properties["Archetype ID"].isin(
        needs_cwi["Archetype ID"]
    )

    # Drop rows without an Address ID and repeated header rows
    archetyped_properties = archetyped_properties[~pd.isnull(archetyped_properties["Address ID"])]
    archetyped_properties = archetyped_properties[archetyped_properties["Address ID"] != "Address ID"]

    # this is the big list!!!
    features = pd.read_csv(
        customer_dir + "/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )
    features["Address ID"] = features["Address ID"].astype(str)

    features_to_merge = features[
        [
            "Address ID", "Organisation Reference", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating",
            "Main Fuel",
            "Hot Water",
            "Renewables", "Total Floor Area"
        ]
    ]

    stonewater_cavity_properties = archetyped_properties[
        ["Name", "Postcode", "Osm. ID", "Org. ref.", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no",
         "Street name",
         "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"]
    ].merge(
        features_to_merge, how="left", on="Address ID"
    )

    # We filter this down to the properties that are cavity properties
    stonewater_cavity_properties = stonewater_cavity_properties[
        stonewater_cavity_properties["Is Cavity Property"] |
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"]
    ].copy()

    # Reasons are applied from the most general to the most specific; later assignments win
    stonewater_cavity_properties["Reason Included"] = "As Built Cavity Property"
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
        ~stonewater_cavity_properties["Is Cavity Property"],
        "Survey revealed potential need for CWI or extract and re-fill",
        stonewater_cavity_properties["Reason Included"]
    )
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
        stonewater_cavity_properties["Is Cavity Property"],
        "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
        stonewater_cavity_properties["Reason Included"]
    )
    # We indicate the exact properties that need CWI, based on survey findings
    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Address ID"].isin(
            needs_cwi[needs_cwi["Main Wall Insulation"] == "CWI RdSAP Default"]["Address ID"].astype(int).astype(
                str).values
        ),
        "Survey showed this property needs CWI",
        stonewater_cavity_properties["Reason Included"]
    )

    stonewater_cavity_properties["Reason Included"] = np.where(
        stonewater_cavity_properties["Address ID"].isin(
            needs_cwi[needs_cwi["Main Wall Insulation"] == "Poss Extract CWI & Refill (issues identified)"][
                "Address ID"].astype(int).astype(str).values
        ),
        "Survey showed this property could need extract and re-fill",
        stonewater_cavity_properties["Reason Included"]
    )

    # Every reason that stems from a survey finding (as opposed to simply being as-built cavity)
    survey_reasons = [
        "Survey revealed potential need for CWI or extract and re-fill",
        "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
        "Survey showed this property needs CWI",
        "Survey showed this property could need extract and re-fill"
    ]

    # We flag units that were installed under ECO3
    numeric_ids = eco_rolling_master[eco_rolling_master["STONEWATER UPRN"] != "NOT ON ASSET LIST"]
    # .copy() so the column assignments below act on an independent frame, not a slice of the master
    numeric_ids = numeric_ids[~pd.isnull(numeric_ids["STONEWATER UPRN"])].copy()
    numeric_ids["STONEWATER UPRN"] = numeric_ids["STONEWATER UPRN"].astype(int)

    stonewater_cavity_properties["Installed under ECO3"] = stonewater_cavity_properties["Org. ref."].isin(
        numeric_ids['STONEWATER UPRN'].values
    )

    # Which postcodes were installed under ECO3
    priority_list_eco3 = stonewater_cavity_properties[
        stonewater_cavity_properties["Installed under ECO3"]
    ]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3

    # These are 66 properties we might want to start with as an immediate priority
    stonewater_cavity_properties["Same Postcode as Installed under ECO3"] = (
        ~stonewater_cavity_properties["Installed under ECO3"] & (
            stonewater_cavity_properties["Postcode"].isin(priority_list_eco3)
        )
    )

    stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
    # Find the postcodes where an Osmosis survey revealed a need for CWI
    postcodes_found_needing_cwi = stonewater_cavity_properties[
        stonewater_cavity_properties["Reason Included"].isin(survey_reasons)
    ]["Postcode"].unique()

    # Same postcode as a surveyed CWI finding, but not itself included because of a survey
    stonewater_cavity_properties["Suspected Needs CWI - not surveyed"] = (
        stonewater_cavity_properties["Postcode"].isin(postcodes_found_needing_cwi) &
        ~stonewater_cavity_properties["Reason Included"].isin(survey_reasons)
    )

    # Merge the EPCs on, with the data we need
    stonewater_cavity_properties = stonewater_cavity_properties.rename(
        columns={
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # We now flag the additional properties in the as built list

    additional_properties = features[
        ~features["Address ID"].isin(archetyped_properties["Address ID"].values)
    ]

    # Filter on as built cavity properties
    additional_properties = additional_properties[
        additional_properties["Walls"].isin(cavity_descriptions)
    ].copy()
    additional_properties["Full Address"] = additional_properties["Address"].copy()
    house_numbers = []
    for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
        first_address_line = x["Address"].split(",")[0]
        house_no = SearchEpc.get_house_number(first_address_line, x["Postcode"])
        if house_no is None:
            house_no = first_address_line
        # If we end up with a number like "01" we need to remove the leading zero
        house_no = house_no.lstrip("0")
        house_numbers.append(
            {
                "Address ID": x["Address ID"],
                "Number": house_no
            }
        )

    house_numbers = pd.DataFrame(house_numbers)
    additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
    additional_properties["row_id"] = additional_properties["Address ID"].copy()

    # Flag any units in this list that were installed under ECO3
    additional_properties["Installed under ECO3"] = additional_properties["Organisation Reference"].isin(
        numeric_ids['STONEWATER UPRN'].values
    )

    # Additional list ECO3
    additional_list_eco3 = additional_properties[additional_properties["Installed under ECO3"]]["Postcode"].unique()

    # These are properties that were not installed under ECO3, that have the same postcodes as properties
    # installed under ECO3
    # These are 297 properties we might want to start with as an immediate priority
    additional_properties["Same Postcode as Installed under ECO3"] = (
        ~additional_properties["Installed under ECO3"] & (
            additional_properties["Postcode"].isin(additional_list_eco3)
        )
    )

    # We do some additional manual checks, for ECO3 properties that were installed that didn't get matched to either
    # dataset
    numeric_ids["In asset list"] = numeric_ids["STONEWATER UPRN"].isin(
        stonewater_cavity_properties['Org. ref.'].astype(int).values
    )
    numeric_ids["In asset list"] = numeric_ids["In asset list"] | (
        numeric_ids["STONEWATER UPRN"].isin(
            additional_properties['Organisation Reference'].astype(int).values
        )
    )

    # eco3_installs_not_in_asset_list = numeric_ids[~numeric_ids["In asset list"]]
    # # We now take samples of properties randomly and manually check the ID against the asset list
    # print(eco3_installs_not_in_asset_list.sample(1)[["STONEWATER UPRN", "Post Code", "NO ", "Street / Block Name", ]])
    # # Checked STONEWATER UPRN
    # # 9862, BH15 1NR, 33, THE QUAY FOYER [x]
    # # 12785, S01 66PN, 57, SEACOLE GARDENS [x]
    # # 26071, MK42 0TE, 51, De Havilland Avenue, Shortstown [x]
    # # 18213, HR6 9UW, 20 Ford Street [x]
    # # 24344, LU4 9FF, 6 SEAL CLOSE [x]
    # # 31222, SN14 0QZ, 7 HARDBROOK COURT [x]
    # # 9343, SP4 7XL, 10 OAK PLACE [x]
    # # 34730, LU5 5TN, 4 TUDOR DRIVE [x]
    # # 7021, BN27 2BZ, 32 BUTTS FIELD []
    #
    # stonewater_cavity_properties[stonewater_cavity_properties['Org. ref.'] == 7021]
    # stonewater_cavity_properties[stonewater_cavity_properties['Postcode'] == "BN27 2BZ"]["Name"]
    #
    # additional_properties[additional_properties['Organisation Reference'] == 7021]
    # additional_properties[additional_properties['Postcode'] == "BN27 2BZ"][["Address"]]

    # Pull the EPCs for these properties
    # additional_properties_epcs, errors = get_data(additional_properties)

    # Save this data as a pickle
    # import pickle
    # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
    #           "wb") as f:
    #     pickle.dump(additional_properties_epcs, f)

    additional_properties["Suspected Needs CWI - not surveyed"] = (
        additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
        ~additional_properties["Installed under ECO3"]
    )

    # We drop Full Address
    additional_properties = additional_properties.drop(columns=["Full Address"])
    additional_properties2 = additional_properties[[
        "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
        "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
        'Same Postcode as Installed under ECO3', "Organisation Reference",
    ]].rename(
        columns={
            "Organisation Reference": "Org. ref.",
            "SAP": "Parity - Predicted SAP",
            "SAP Band": "Parity - Predicted SAP Band",
            "Age": "Parity - Build Age",
            "Property Type": "Parity - Property Type",
            "Walls": "Parity - Wall Construction",
            "Roofs": "Parity - Roof Construction",
            "Glazing": "Parity - Glazing Type",
            "Heating": "Parity - Heating Type",
            "Main Fuel": "Parity - Main Fuel",
            "Hot Water": "Parity - Hot Water",
            "Renewables": "Parity - Renewables",
            "Total Floor Area": "Parity - Total Floor Area"
        }
    )

    # Combine the data:

    stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
        features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
    )
    # NOTE: additional_properties2 carries no "Suspected Needs CWI - not surveyed" column, so those rows
    # hold NaN for it after the concat and are never selected by the `== True` filter below.
    full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
    full_dataset = full_dataset.drop(columns=['Osm. ID'])

    # We now define the priority list for non-intrusives
    full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
    full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0]

    # Strip out anything we definitely don't want
    full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]

    areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique()

    priorities = full_dataset[
        full_dataset["Postal Region 2"].isin(areas)
    ]

    # Keep only the postal districts with more than 100 candidate properties. The counts are filtered
    # on the Series itself, so this does not depend on how value_counts() names its column.
    region_prevalence = priorities["Postal Region 2"].value_counts()
    large_regions = region_prevalence[region_prevalence > 100].index
    df = priorities[priorities["Postal Region 2"].isin(large_regions)]

    if df["Installed under ECO3"].sum():
        raise ValueError("There are properties in the priority list that were installed under ECO3")

    df.to_csv(
        customer_dir + "/Non-intrusives/10022025 Non-Intrusives - "
        "revised list.csv",
        index=False
    )
def cross_reference_epc_programme():
    """
    Cross-reference the ECO3 fall-out address list against the Parity master sheet.

    For every fall-out address two candidate sets are collected from the master sheet: rows with the same
    house number, and rows whose address has a fuzzy ratio above 90 against the fall-out address.

    NOTE(review): the original function was unfinished. It computed the matches but discarded them and
    returned nothing. The matches are now collected and returned; how the two candidate sets should be
    combined is still to be confirmed.

    :return: List of dicts, one per fall-out row, with keys "ADDRESS", "house_number_matches" and
        "address_matches" (the latter two are DataFrames of master-sheet rows).
    """
    # Imported once here rather than on every iteration of the matching loop
    from fuzzywuzzy import fuzz

    eco3_fallout = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
    )

    def _house_number_or_first_line(address):
        # Fall back to the first address line when no house number can be extracted
        house_no = SearchEpc.get_house_number(address, "")
        if house_no is None:
            house_no = address.split(",")[0]
        return house_no

    # The original applied this fallback inside an iterrows() loop. iterrows() yields copies, so the
    # values were never written back; the fallback is applied here, where it takes effect.
    eco3_fallout["house_number"] = eco3_fallout["ADDRESS"].apply(_house_number_or_first_line)

    stonewater_modelled_above_c = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )

    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
    )

    eco3_fallout_matched_to_above_c = []
    for _, fallout_row in eco3_fallout.iterrows():
        # Match on house number
        house_number_matches = stonewater_modelled_above_c[
            stonewater_modelled_above_c["house_number"] == fallout_row["house_number"]
        ]

        # We do a fuzzy match on the address, with Levenshtein distance
        address_matches = stonewater_modelled_above_c[
            stonewater_modelled_above_c["Address"].apply(
                lambda x: fuzz.ratio(x, fallout_row["ADDRESS"]) > 90
            )
        ]

        eco3_fallout_matched_to_above_c.append(
            {
                "ADDRESS": fallout_row["ADDRESS"],
                "house_number_matches": house_number_matches,
                "address_matches": address_matches,
            }
        )

    return eco3_fallout_matched_to_above_c
def finalise_list_for_non_intrusives():
    """
    Produce the final non-intrusive survey list and write it to Excel.

    Starting from the reviewed non-intrusives workbook, this drops ECO3 installs and properties already
    surveyed by Osmosis, adds the organisation reference from the Parity master sheet, and keeps only rows
    whose postcode also appears on a row marked "Include in non-intrusives".

    :return: None
    """
    candidates = pd.read_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
        "Non-Intrusives.xlsx"
    )

    # Anything installed under ECO3 is out of scope
    installed_under_eco3 = candidates["Installed under ECO3"]
    candidates = candidates[~installed_under_eco3]

    # Properties in the modelled packages sheet have already been surveyed by Osmosis
    modelled_packages = pd.read_excel(
        "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
        "(1).xlsx",
        header=13,
        sheet_name="Modelled Packages"
    )
    surveyed_address_ids = modelled_packages["Address ID"].values
    candidates["Surveyed by Osmosis"] = candidates["Address ID"].isin(surveyed_address_ids)

    # Removed 54 addresses
    shortlist = candidates[~candidates["Surveyed by Osmosis"]]

    master_sheet = pd.read_csv(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
        "master sheet.csv",
        encoding='latin1'
    )

    # Add on the organisation reference
    organisation_references = master_sheet[["Organisation Reference", "Address ID"]]
    shortlist = shortlist.merge(organisation_references, how="left", on="Address ID")

    outward_code = shortlist["Postcode"].str.split(" ").str[0]
    shortlist["Postal Region"] = outward_code.str[0:2]

    # NOTE(review): the selection below works on full postcodes although the names speak of regions and
    # "Postal Region" is computed just above. Confirm which level was intended.
    flagged_rows = shortlist[shortlist["Include in non-intrusives"]]
    selected_regions = flagged_rows["Postcode"].unique()
    shortlist["Is in region"] = shortlist["Postcode"].isin(selected_regions)

    # Filter down:
    shortlist = shortlist[shortlist["Is in region"]]

    shortlist.to_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
        "List - final.xlsx")
def app():
    """
    Pull UPRNs and heritage/conservation flags for the United Living "Potential GMCA" property list.

    Reads the property workbook, looks each address up through ``get_data`` (EPC API only), fills three
    UPRNs by hand, merges the spatial flags from ``OpenUprnClient`` and writes the result to Excel.

    :return: None
    """
    filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03.xlsx"

    df = pd.read_excel(filepath)
    df["row_id"] = df.index
    df["house_number"] = df.apply(
        lambda row: SearchEpc.get_house_number(row["Address"], row["Postcode"]),
        axis=1
    )

    # Only the first and third return values are used; the third is treated as the row ids without data
    properties_data, _second, missing_row_ids = get_data(
        df=df,
        manual_uprn_map={},
        epc_auth_token=EPC_AUTH_TOKEN,
        uprn_column=None,
        fulladdress_column="Address",
        address1_column="house_number",
        postcode_column="Postcode",
        property_type_column=None,
        built_form_column=None,
        epc_api_only=True,
        row_id_name="row_id",
    )

    # Kept for manual inspection of the addresses that returned nothing
    no_data = df.loc[df["row_id"].isin(missing_row_ids), ["Address", "Postcode"]]

    found_uprns = pd.DataFrame(properties_data)[["uprn", "row_id"]]
    data = df.merge(found_uprns, how="left", on="row_id")

    # Fill missing UPRNs, looked up by hand:
    # 53 108 Alexandra Street OL6 9QP 100011536830
    # 56 301 Whiteacre Road OL6 9QF 100011557437
    # 65 97 Princess Street OL6 9QJ 100011551813
    manual_uprns = {
        "108 Alexandra Street": 100011536830,
        "301 Whiteacre Road": 100011557437,
        "97 Princess Street": 100011551813,
    }
    for address, uprn in manual_uprns.items():
        data["uprn"] = np.where(data["Address"] == address, uprn, data["uprn"])

    # We now get whether the property is listed, heritage or in a conservation area
    # NOTE(review): the UPRNs are only cast to int after this lookup. Confirm the client accepts them as-is.
    spatial_data = OpenUprnClient.get_spatial_data(uprns=data["uprn"].tolist(), bucket_name="retrofit-data-dev")
    spatial_data = spatial_data.rename(columns={"UPRN": "uprn"})

    data["uprn"] = data["uprn"].astype(int)
    merged = data.merge(spatial_data, how="left", on="uprn")

    # Properties without spatial data are treated as having no restriction
    flag_columns = ('conservation_status', 'is_listed_building', 'is_heritage_building')
    for column in flag_columns:
        merged[column] = merged[column].fillna(False)

    merged.to_excel(
        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03 - data "
        "pulled.xlsx",
        index=False
    )
"/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", + sheet_name="Eligible postcodes", + header=1 +) +# Format: +whlg_eligible_postcodes = whlg_eligible_postcodes[['Postcode', 'Local Authority']] + +uprns = epc_data["uprn"].unique() +# Get data +ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev") +epc_data = epc_data.merge( + ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename( + columns={"UPRN": "uprn"} + ), + how="left", + on="uprn", +) + +epc_data["has_conservation_restrictions"] = ( + (epc_data["conservation_status"] == True) + | (epc_data["is_listed_building"] == True) + | (epc_data["is_heritage_building"] == True) +) + +whlg_eligible_postcodes["Local Authority"].value_counts() + +whlg_eligible_postcodes = whlg_eligible_postcodes[whlg_eligible_postcodes["Local Authority"] == "Waltham Forest"] + +# Pathway 1: +# Match based on eligible postcodes +pathway1 = epc_data[epc_data["postcode"].isin(whlg_eligible_postcodes["Postcode"].values)] +pathway1 = pathway1[ + [ + "uprn", "address", "address1", "postcode", "current-energy-rating", "current-energy-efficiency", + "lodgement-date", + "has_conservation_restrictions", "walls-description", "roof-description", "mainheat-description" + ] +] + +pathway1 = pathway1.rename( + columns={ + "current-energy-rating": "EPC Rating", "current-energy-efficiency": "SAP Score", + "lodgement-date": "EPC Date", "has_conservation_restrictions": "Conservation Area Restrictions", + "walls-description": "Wall Type", "roof-description": "Roof Type", "mainheat-description": "Main Heating" + } +) + +pathway1["EPC Date"] = pd.to_datetime(pathway1["EPC Date"]).dt.strftime("%Y-%m-%d") +# Create a year EPC was lodged +pathway1["EPC Year"] = pd.to_datetime(pathway1["EPC Date"]).dt.year + +low_epc = pathway1[pathway1["EPC Rating"].isin(["F", "G"])] +low_epc["EPC Rating"].value_counts() +low_epc.tail(1)[["address", "postcode"]] + +pathway1.to_csv( + 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Waltham Forest WHLG - Pathway 1 Eligibility.csv", + index=False +) + +# Pathway 2 or 3 +# The household will need to be means tested +pathway2 = epc_data[~epc_data["uprn"].isin(pathway1["uprn"].values)] diff --git a/etl/customers/warwick/remote_assessments.py b/etl/customers/warwick/remote_assessments.py new file mode 100644 index 00000000..a9b654b7 --- /dev/null +++ b/etl/customers/warwick/remote_assessments.py @@ -0,0 +1,123 @@ +import pandas as pd +from utils.s3 import save_csv_to_s3 + +PORTFOLIO_ID = 115 +USER_ID = 8 + + +def app(): + """ + Used to set up the remote assessments for Warwick + """ + + asset_list = [ + { + "uprn": 10033604792, + "address": "Flat 2, 3 Green Street", + "postcode": "W1K 6RN" + }, + { + "uprn": 10033604794, + "address": "Flat 4, 3 Green Street", + "postcode": "W1K 6RN" + }, + { + "uprn": 10033615515, + "address": "Apartment 4, 52 Green Street", + "postcode": "W1K 6RS" + } + ] + asset_list = pd.DataFrame(asset_list) + + # Store the asset list in s3 + filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv" + save_csv_to_s3( + dataframe=asset_list, + bucket_name="retrofit-plan-inputs-dev", + file_name=filename + ) + + non_invasive_recommendations = [ + { + "uprn": 10033604792, + "recommendations": [ + { + "type": "internal_wall_insulation", + "sap_points": 16, + "survey": True + } + ] + }, + { + "uprn": 10033604794, + "recommendations": [ + { + "type": "internal_wall_insulation", + "sap_points": 14, + "survey": True + } + ] + }, + { + "uprn": 10033615515, + "recommendations": [ + { + "type": "room_roof_insulation", + "sap_points": 12, + "survey": True + }, + { + "type": "internal_wall_insulation", + "sap_points": 2, + "survey": True + } + ] + } + ] + + # Store non-invasive recommendations in S3 + non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(non_invasive_recommendations), + 
bucket_name="retrofit-plan-inputs-dev", + file_name=non_invasive_recommendations_filename + ) + + valuation_data = [ + { + "uprn": 10033604792, + "value": 3_692_000 + }, + { + "uprn": 10033604794, + "value": 3_789_000 + }, + { + "uprn": 10033615515, + "value": 3_499_000 + } + ] + + # Store valuation data to s3 + valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv" + save_csv_to_s3( + dataframe=pd.DataFrame(valuation_data), + bucket_name="retrofit-plan-inputs-dev", + file_name=valuation_filename + ) + + body = { + "portfolio_id": str(PORTFOLIO_ID), + "housing_type": "Private", + "goal": "Increasing EPC", + "goal_value": "C", + "trigger_file_path": filename, + "already_installed_file_path": "", + "patches_file_path": "", + "non_invasive_recommendations_file_path": non_invasive_recommendations_filename, + "valuation_file_path": valuation_filename, + "scenario_name": "Full package remote assessment", + "multi_plan": True, + "budget": None, + } + print(body) diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py index aca36584..76087a76 100644 --- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py +++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py @@ -1,7 +1,7 @@ import os import re import openpyxl -import Levenshtein +from fuzzywuzzy import fuzz from pathlib import Path import msgpack from datetime import datetime @@ -2771,7 +2771,8 @@ class DataLoader: match_to = [x.replace(" ", "") for x in match_to] # Perform matching between full key and match_to - distances = [Levenshtein.distance(matching_string, s) for s in match_to] + distances = [100 - fuzz.ratio(matching_string, s) for s in match_to] + best_match_index = distances.index(min(distances)) # We might want to consider a threshold for the distance, however for the momeny, # we don't consider this for the moment @@ -2897,6 +2898,17 @@ class DataLoader: # Merge onto the survey list survey_list = survey_list.merge(matching_lookup, how='left', 
on="survey_list_row_id") + # TEMP FOR NEWER WORK + # matching_lookup = matching_lookup.merge( + # asset_list[["asset_list_row_id", "UPRN"]], how="left", on="asset_list_row_id" + # ).merge( + # survey_list[["survey_list_row_id", "NO.", "Street / Block Name", "Post Code"]], + # how="left", on="survey_list_row_id" + # ) + # matching_lookup.to_csv( + # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane/surveys_to_assets.csv" + # ) + return survey_list @staticmethod diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py index 3f2e810e..83a85b78 100644 --- a/etl/epc/Dataset.py +++ b/etl/epc/Dataset.py @@ -203,11 +203,11 @@ class TrainingDataset(BaseDataset): common_cols = [[col + "_starting", col + "_ending"] for col in common_cols] self.df = self.df.loc[ - :, - no_suffix_cols - + only_ending_cols - + [col for cols in common_cols for col in cols], - ] + :, + no_suffix_cols + + only_ending_cols + + [col for cols in common_cols for col in cols], + ] def _remove_abnormal_change_in_floor_area(self): """ @@ -511,7 +511,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_sandstone_or_limestone"] == expanded_df["is_sandstone_or_limestone_ending"] ) - ] + ] elif component == "floor": expanded_df = expanded_df[ (expanded_df["is_suspended"] == expanded_df["is_suspended_ending"]) @@ -528,7 +528,7 @@ class TrainingDataset(BaseDataset): expanded_df["is_to_external_air"] == expanded_df["is_to_external_air_ending"] ) - ] + ] elif component == "roof": expanded_df = expanded_df[ (expanded_df["is_pitched"] == expanded_df["is_pitched_ending"]) @@ -541,7 +541,7 @@ class TrainingDataset(BaseDataset): expanded_df["has_dwelling_above"] == expanded_df["has_dwelling_above_ending"] ) - ] + ] return expanded_df diff --git a/etl/epc/Record.py b/etl/epc/Record.py index 4c1a912b..9ff1de0a 100644 --- a/etl/epc/Record.py +++ b/etl/epc/Record.py @@ -139,28 +139,22 @@ class EPCRecord: self._clean_records_using_epc_records() self._clean_with_data_processor() - 
self._expand_prepared_epc_to_attributes() - self._identify_delta_between_prepared_and_original_records() # Process to create uvalues for the single epc record - - # selff.df = self.epc_record_as_dataframe('prepared_epc') - + # self.df = self.epc_record_as_dataframe('prepared_epc') # self._feature_generation() # self._drop_features() return - self._expand_description_to_features() - self._expand_description_to_uvalues() - + # self._expand_description_to_features() + # self._expand_description_to_uvalues() + # # self._generate_uvalues() # self._validate_expanded_description() # self._validate_u_values() - # etc - pass def _drop_features(self): """ @@ -359,6 +353,8 @@ class EPCRecord: self._clean_property_dimensions() self._clean_number_lighting_outlets() self._clean_floor_level() + self._clean_floor_height() + self._clean_constituency() # self._clean_potential_energy_efficiency() # self._clean_environment_impact_potential() @@ -387,6 +383,31 @@ class EPCRecord: return df + def _clean_floor_height(self): + """ Remaps anomalies in floor height to the average floor height for the property type """ + floor_height_data = self.cleaning_data[ + (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) & + (self.cleaning_data["built_form"] == self.prepared_epc["built-form"]) + ] + average = floor_height_data["floor_height"].mean() + sd = floor_height_data["floor_height"].std() + # If we're in the top 0.5 percentile of floor heights, we'll set it to the average + if self.prepared_epc["floor-height"] > average + 10 * sd: + self.prepared_epc["floor-height"] = average + if self.prepared_epc["floor-height"] <= 1.665: + self.prepared_epc["floor-height"] = average + + def _clean_constituency(self): + """ + We handle the single case of finding a missing constituency by using the local authority + """ + if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""): + if self.prepared_epc["local-authority"] != "E06000044": + raise 
NotImplementedError( + "This function is only implemented for Portsmouth, in the single edgecase seen" + ) + self.prepared_epc["constituency"] = "E14000883" + def _clean_floor_level(self): """ This method will clean the floor level, if empty or invalid diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py new file mode 100644 index 00000000..f085c8fb --- /dev/null +++ b/etl/find_my_epc/AssetListEpcData.py @@ -0,0 +1,133 @@ +import time +import pandas as pd +from tqdm import tqdm +from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc +from backend.SearchEpc import SearchEpc +from utils.logger import setup_logger + +logger = setup_logger() + + +class AssetListEpcData: + + def __init__(self, asset_list: pd.DataFrame, epc_auth_token: str): + + """ + This class handles pulling data assocaited to an asset list and performs common functions like + getting EPC api data, retrieveing data form the find my epc website and extracting non-intrusive + recommendations + :param asset_list: + """ + + # Check the asset list contains the correct columns + + self.asset_list = self.check_asset_list(asset_list) + self.epc_auth_token = epc_auth_token + + self.extracted_data = None + self.non_invasive_recommendations = None + self.patches = None + + @staticmethod + def check_asset_list(asset_list): + # TODO: Update this with pydantic + + return asset_list + + def get_non_invasive_recommendations(self): + + """ + Extracts non-invasive recommendations in a format that can be used by the engine + :return: + """ + + if self.extracted_data is None: + raise ValueError("Please run get_data first") + + self.non_invasive_recommendations = [ + { + "uprn": r.get("uprn"), + "address": r["address"], + "postcode": r["postcode"], + "recommendations": r["recommendations"] + } for r in self.extracted_data + ] + + def get_patch(self): + """ + + :return: + """ + if self.extracted_data is None: + raise ValueError("extracted data is missing - run get_data first") + + 
self.patches = [ + { + "uprn": r.get("uprn"), + **r.get("patch") + } for r in self.extracted_data if r.get("patch") + ] + + def get_data(self): + + logger.info("Retrieving data for given asset list") + + # Pull the additional data + extracted_data = [] + for _, home in tqdm(self.asset_list.iterrows(), total=len(self.asset_list)): + add1 = home["address"] + pc = home["postcode"] + # Retrieve the EPC data + epc_searcher = SearchEpc( + address1=add1, + postcode=pc, + uprn=home.get("uprn"), + auth_token=self.epc_auth_token, + os_api_key="", + ) + epc_searcher.ordnance_survey_client.property_type = home.get("property_type") + epc_searcher.ordnance_survey_client.built_form = home.get("built_form") + epc_searcher.find_property(skip_os=True) + + if epc_searcher.newest_epc is None: + continue + + if not pd.isnull(home.get("patch")): + epc_searcher.newest_epc["address1"] = add1 + + # Attempt both methods: + try: + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + except Exception as e: + logger.error(f"Error retrieving find my epc data: {e}") + find_epc_searcher = RetrieveFindMyEpc( + address=epc_searcher.newest_epc["address1"], + postcode=epc_searcher.newest_epc["postcode"] + ) + find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data() + time.sleep(0.5) + # We need uprn + + to_append = { + "uprn": home.get("uprn"), + "address": home["address"], + "postcode": home["postcode"], + **find_epc_data, + } + if not pd.isnull(home.get("patch")): + to_append["patch"] = { + "current-energy-rating": find_epc_data["current_epc_rating"], + "current-energy-efficiency": find_epc_data["current_epc_efficiency"], + "potential-energy-rating": find_epc_data["potential_epc_rating"], + "potential-energy-efficiency": find_epc_data["potential_epc_efficiency"], + 
**find_epc_data["epc_data"] + } + + extracted_data.append(to_append) + + self.extracted_data = extracted_data + logger.info("Data Extrction complete") diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py new file mode 100644 index 00000000..86c3fda1 --- /dev/null +++ b/etl/find_my_epc/RetrieveFindMyEpc.py @@ -0,0 +1,480 @@ +import re +import pandas as pd +import requests +from bs4 import BeautifulSoup +from datetime import datetime + +from utils.logger import setup_logger + +logger = setup_logger() + + +class RetrieveFindMyEpc: + SEARCH_POSTCODE_URL = ( + "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}" + ) + BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk" + + HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' + 'Chrome/111.0.0.0 Safari/537.36' + } + + def __init__(self, address: str, postcode: str): + """ + This class is tasked with retrieving the latest EPC data from the find my epc website + :param address: The address of the property + :param postcode: The postcode of the property + """ + self.address = address + self.postcode = postcode + + self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower() + self.walls = [] + + @staticmethod + def extract_low_carbon_sources(soup): + # Find the section header + section_header = soup.find("h3", string="Low and zero carbon energy sources") + if not section_header: + return {} + + # Locate the list following the header + energy_list = section_header.find_next("ul") + + # Extract the list items + sources = {item.get_text(strip=True): True for item in energy_list.find_all("li")} + return sources + + @staticmethod + def get_text(elem): + return elem.get_text(strip=True) if elem else None + + def extract_epc_data(self, soup): + + results = {} + + # 1. 
Total floor area + results['total-floor-area'] = int(self.get_text( + soup.find("dt", string="Total floor area").find_next_sibling("dd") + ).split(" ")[0]) + + # Table with features + rows = soup.select("table.govuk-table tbody tr") + + rating_map = { + "Very poor": "Very Poor", + "Very good": "Very Good" + } + + def get_feature_row_text(feature_name, index=0): + matches = [row for row in rows if row.find("th") and feature_name in row.find("th").text] + if len(matches) > index: + cells = matches[index].find_all("td") + description = self.get_text(cells[0]) + rating = self.get_text(cells[1]) + return description, rating_map.get(rating, rating) + return None, None + + # 2-3. First wall description and rating + results['walls-description'], results['walls-energy-eff'] = get_feature_row_text("Wall", 0) + + # 4-5. First roof description and rating + results['roof-description'], results['roof-energy-eff'] = get_feature_row_text("Roof", 0) + + # 6-7. Windows description and rating + results['windows-description'], results['windows-energy-eff'] = get_feature_row_text("Window") + + # 8-9. Main heating description and rating + results['mainheat-description'], results['mainheat-energy-eff'] = get_feature_row_text("Main heating") + + # 10-11. Main heating control description and rating + results['mainheatcont-description'], results['mainheatc-energy-eff'] = get_feature_row_text( + "Main heating control" + ) + + # 12-13. Hot water description and rating + results['hotwater-description'], results['hot-water-energy-ef'] = get_feature_row_text("Hot water") + + # 14-15. Lighting description and rating + results['lighting-description'], results['lighting-energy-eff'] = get_feature_row_text("Lighting") + + # 16. Floor description + results['floor-description'], _ = get_feature_row_text("Floor") + + # 17. Secondary heating description + results['secondheat-description'], _ = get_feature_row_text("Secondary heating") + + # 18. 
Primary energy use + p_energy = soup.find(string=lambda t: "primary energy use for this property per year" in t.lower()) + # We should always have this + match = re.search(r"(\d+)\s+kilowatt", p_energy) + results['energy-consumption-current'] = int(match.group(1)) if match else None + + # 19. Current CO2 emissions + co2_now = soup.find("dd", id="eir-property-produces") + # We should always have this + match = re.search(r"([\d.]+)", co2_now.text) + results['co2-emissions-current'] = float(match.group(1)) if match else None + # Need co2-emiss-curr-per-floor-area + + # 20. Potential CO2 emissions + co2_pot = soup.find("dd", id="eir-potential-production") + match = re.search(r"([\d.]+)", co2_pot.text) + results['co2-emissions-potential'] = float(match.group(1)) if match else None + + return results + + def retrieve_newest_find_my_epc_data(self, sap_2012_date=None): + """ + For a post code and address, we pull out all the required data from the find my epc website + """ + + postcode_input = self.postcode.replace(" ", "+") + postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input) + postcode_response = requests.get(postcode_search, headers=self.HEADERS) + + postcode_res = BeautifulSoup(postcode_response.text, features="html.parser") + rows = postcode_res.find_all('tr', class_='govuk-table__row') + + extracted_table = [] + for row in rows: + # Extract the address and URL + address_tag = row.find('a', class_='govuk-link') + if address_tag is None: + continue + extracted_address = None + extracted_address_url = None + if address_tag: + extracted_address = address_tag.text.strip() + extracted_address_url = address_tag['href'] + + extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower() + if not extracted_address_cleaned.startswith(self.address_cleaned): + continue + + # If the address is a match, we can extract the data + + # Extract the expiry date + expiry_date_tag = row.find('td', class_='govuk-table__cell date') + 
expiry_date = None + if expiry_date_tag is not None: + expiry_date = expiry_date_tag.parent.find('span').text.strip() + + extracted_table.append( + { + "extracted_address": extracted_address, + "extracted_address_url": extracted_address_url, + "expiry_date": datetime.strptime(expiry_date, '%d %B %Y'), + } + ) + + if not extracted_table: + raise ValueError("No EPC found") + + if len(extracted_table) > 1: + # We take the one with the most recent expiry date + extracted_table = sorted(extracted_table, key=lambda x: x['expiry_date'], reverse=True) + + chosen_epc = self.BASE_ENERGY_URL + extracted_table[0]['extracted_address_url'] + epc_certificate = chosen_epc.split('/')[-1] + + address_response = requests.get(chosen_epc, headers=self.HEADERS) + address_res = BeautifulSoup(address_response.text, features="html.parser") + + # Key data we want to retrieve: + # 1) Rating + # 2) Bills estimates + # 3) Recommendations and SAP points + # 4) Low and zero carbon energy sources + # 5) The wall types of the property - used for determining if we have an extension wall insulation# + # recommendation + + ratings = address_res.find('desc', {'id': 'svg-desc'}).text + current_rating = ratings.split(".")[0] + potential_rating = ratings.split(".")[1] + current_sap = int(current_rating.split(' ')[-1]) + + # Floor area + address_res.find() + + # Retrieve the energy consumption + bills = address_res.find('div', {'id': 'bills-affected'}) + bills_list = bills.find_all('li') + if not bills_list: + # If this is the case, it's usually becaue the EPC was very old. 
Early EPCs did not have this information + heating_text = None + hot_water_text = None + else: + heating_text = bills_list[0].text + hot_water_text = bills_list[1].text + + # Retrieve the recommendations and SAP points + recommendations = [] + recommendations_div = address_res.find('div', class_='epb-recommended-improvements') + if recommendations_div: + # Find all h3 headers for each step and extract their related information + step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m') + previous_sap_score = current_sap + previous_epc = current_rating.split(' ')[-6] + for step_num, step_header in enumerate(step_headers, start=1): + # Extract the step title (the measure) + measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "") + + # Find the div containing the potential rating within the same section + potential_rating_div = step_header.find_next( + 'div', class_='epb-recommended-improvements__potential-rating' + ) + + # Check if the potential rating div is found + if potential_rating_div: + # Extract the rating text within the SVG text element + extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold') + if extracted_rating_text is not None: + rating_text = extracted_rating_text.text.strip() + else: + rating_text = " ".join([str(previous_sap_score), previous_epc]) + # Parse the rating text to separate the numeric rating and EPC letter + new_rating = int(rating_text.split()[0]) + new_epc = rating_text.split()[1] + + # Append the information as a dictionary to the recommendations list + recommendations.append({ + "step": step_num, + "measure": measure_title, + "new_rating": new_rating, + "new_epc": new_epc, + "sap_points": new_rating - previous_sap_score + }) + previous_sap_score = new_rating + previous_epc = new_epc + + # Search for the assessment informaton + assessment_information = address_res.find('div', {'id': 'information'}) + # Parse this information + rows = 
assessment_information.find_all('div', class_='govuk-summary-list__row') + # Create a dictionary to hold the parsed information + assessment_data = {} + for row in rows: + key = row.find('dt').text.strip() + if key == "Type of assessment": + # We dont reliably extract this + continue + value_tag = row.find('dd') + + # Check if value contains a link (email) + if value_tag.find('a'): + value = value_tag.find('a').text.strip() + elif value_tag.find('summary'): + value = value_tag.find('span').text.strip() + else: + value = value_tag.text.strip() + + # These are keys that we have for both the surveyor and the acreditation scheme. Firstly, we'll + # get the surveyor's name and email so we make that information clear + if key in ["Telephone", "Email"]: + if "Assessor's " + key not in assessment_data: + assessment_data["Assessor's " + key] = value + else: + assessment_data["Accreditation Scheme's " + key] = value + continue + + assessment_data[key] = value + + expected_keys = [ + 'Assessor’s name', + "Assessor's Telephone", + "Assessor's Email", + 'Assessor’s ID', + 'Accreditation scheme', + 'Assessor’s declaration', + "Accreditation Scheme's Telephone", + "Accreditation Scheme's Email", + 'Date of assessment', + 'Date of certificate' + ] + # Check we have all the expected keys + for key in expected_keys: + if key not in assessment_data: + raise ValueError(f"Missing key: {key}") + + # The wall types of the property + property_features_table = address_res.find("tbody", class_="govuk-table__body") + property_features_table = property_features_table.find_all("tr") + + # Extract wall types + self.walls = [] + for row in property_features_table: + cells = row.find_all("td") + if row.find("th").text.strip() == "Wall": + self.walls.append(cells[0].text.strip()) + + # Finally, we format the recommendations + recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date) + + # 4) Low and zero carbon energy sources + low_carbon_energy_sources = 
self.extract_low_carbon_sources(address_res) + + # 5) Pull out the EPC data + epc_data = self.extract_epc_data(address_res) + + resulting_data = { + 'epc_certificate': epc_certificate, + 'current_epc_rating': current_rating.split(' ')[-6], + 'current_epc_efficiency': current_sap, + 'potential_epc_rating': potential_rating.split(' ')[-6], + "potential_epc_efficiency": int(potential_rating.split(' ')[-1]), + "heating_text": heating_text, + "hot_water_text": hot_water_text, + "recommendations": recommendations, + "epc_data": epc_data, + **assessment_data, + **low_carbon_energy_sources, + } + + return resulting_data + + def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None): + """ + This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey + :param recommendations: The recommendations from the EPC + :param assessment_data: The assessment data from the EPC + :param sap_2012_date: The date of the SAP 2012 update + """ + + measure_map = { + "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"], + "Hot water cylinder insulation": ["hot_water_tank_insulation"], + "Hot water cylinder thermostat": ["cylinder_thermostat"], + "High performance external doors": ["insulated_doors"], + "Floor insulation (solid floor)": ["solid_floor_insulation"], + "Floor insulation (suspended floor)": ["suspended_floor_insulation"], + "Double glazed windows": ["double_glazing"], + "Cavity wall insulation": ["cavity_wall_insulation"], + "Replace boiler with new condensing boiler": ["boiler_upgrade"], + "Floor insulation": ["floor_insulation"], # Recommendation typically associated to older EPCs + "Heating controls (programmer, room thermostat and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Low energy lighting": ["low_energy_lighting"], + "Increase loft insulation to 270 mm": ["loft_insulation"], + "Heating controls (thermostatic 
radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Solar water heating": ["solar_water_heating"], + "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"], + "Heating controls (room thermostat and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Change heating to gas condensing boiler": ["boiler_upgrade"], + "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heater"], + "Flat roof or sloping ceiling insulation": ["flat_roof_insulation"], + "Heating controls (room thermostat)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Band A condensing boiler": ["boiler_upgrade"], + "Double glazing": ["double_glazing"], + "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"], + "Wind turbine": ["wind_turbine"], + "Loft insulation": ["loft_insulation"], + "Solar photovoltaic (PV) panels": ["solar_pv"], + "Party wall insulation": ["party_wall_insulation"], + 'Draught proofing': ["draught_proofing"], + "Roof insulation recommendation": [], + "Cavity wall insulation recommendation": [], + "Windows draught proofing": [], + "Low energy lighting for all fixed outlets": ["low_energy_lighting"], + "Cylinder thermostat recommendation": [], + "Heating controls recommendation": [], + "Replace boiler with Band A condensing boiler": ["boiler_upgrade"], + "Band A condensing gas boiler": ["boiler_upgrade"], + "Solar panel recommendation": [], + "Double glazing recommendation": [], + "Solid wall insulation recommendation": [], + "Fuel change recommendation": [], + "PV Cells recommendation": [], + "Replacement glazing units": ["double_glazing"], + "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"], + "High heat retention storage heaters": ["high_heat_retention_storage_heater"], + "Gas condensing boiler": ["boiler_upgrade"], + "Change room heaters to condensing boiler": ["boiler_upgrade"], + 
"Cylinder thermostat": ["cylinder_thermostat"], + "Heat recovery system for mixer showers": ["heat_recovery_shower"], + "Room-in-roof insulation": ["room_in_roof_insulation"], + "Fan assisted storage heaters": [], + "Fan-assisted storage heaters": [], + "Step 1:": [], + "Step 2:": [], + 'Step 3:': [], + "Biomass stove with boiler": [], + "Replace boiler with biomass boiler": [], + "Heating controls (room thermostat and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Heating controls (programmer, and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Heating controls (programmer and TRVs)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Heating controls (programmer and room thermostat)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + "Replacement warm air unit": [], + "Secondary glazing": ["secondary_glazing"], + "Condensing heating unit": ["boiler_upgrade"], + '???': [], + 'Solar photovoltaic panels, 2.5kWp': ["solar_pv"], + 'Heating controls (programmer, room thermostat and thermostatic radiator valves)': [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ], + 'Translation missing: en.improvement_code.41.title': [], + "Condensing boiler (separate from the range cooker)": ["boiler_upgrade"], + "Heating controls (programmer and thermostatic radiator valves)": [ + "roomstat_programmer_trvs", "time_temperature_zone_control" + ] + } + + survey = True + if sap_2012_date is not None: + certificate_date = datetime.strptime(assessment_data["Date of certificate"], "%d %B %Y") + if certificate_date < pd.to_datetime(sap_2012_date): + survey = False + + formatted_recommendations = [] + for rec in recommendations: + mapped = measure_map[rec["measure"]] + for measure in mapped: + if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower(): + measure = "extension_cavity_wall_insulation" + 
@classmethod
def get_from_epc(cls, epc):
    """
    Retrieve the newest Find-My-EPC recommendations for a single EPC record.

    The lookup is first attempted with ``epc["address"]``. If that attempt raises, the error is
    logged and the lookup is retried once with the backup address held in ``epc["address1"]``.

    :param epc: Mapping-like EPC record. Reads "address", "postcode", "uprn" and, for the
        fallback, "address1".
    :return: dict with the keys "uprn", "address", "postcode" and "recommendations".
    :raises Exception: the original retrieval error when the record has no "address1" to fall
        back on; otherwise whatever the backup attempt raises.
    """
    try:
        searcher = cls(address=epc["address"], postcode=epc["postcode"])
        find_epc_data = searcher.retrieve_newest_find_my_epc_data()
    except Exception as e:
        logger.error(f"Error retrieving find my epc data: {e}")
        # Without a backup address there is nothing left to try. Re-raise the real failure
        # rather than masking it with a KeyError for the missing "address1" field.
        if "address1" not in epc:
            raise
        # We attempt with the backup address
        searcher = cls(address=epc["address1"], postcode=epc["postcode"])
        find_epc_data = searcher.retrieve_newest_find_my_epc_data()

    return {
        "uprn": epc["uprn"],
        "address": epc["address"],
        "postcode": epc["postcode"],
        "recommendations": find_epc_data["recommendations"],
    }
def app():
    """
    Upload the funding reference data from local files to the S3 data bucket.

    Two files are published under the ``funding/`` prefix of the bucket for the configured stage:
    the ECO4 project scores matrix (uploaded as read) and the Warm Homes Local Grant eligible
    postcodes (reduced to a single lower-cased "Postcode" column).
    """
    target_bucket = DATA_BUCKET.format(stage=STAGE)

    # The project scores matrix is stored exactly as it is read
    scores_matrix = pd.read_csv(PROJECTS_SCORES_MATRIX_LOCATION)
    save_csv_to_s3(
        dataframe=scores_matrix,
        bucket_name=target_bucket,
        file_name="funding/ECO4 Full Project Scores Matrix.csv"
    )

    # The Warm Homes Local Grant workbook is tidied before storing: only the postcode column is
    # kept, and it is lower-cased
    whlg_sheet = pd.read_excel(WHLG_ELIGIBLE_POSTCODES, sheet_name="Eligible postcodes", header=1)
    postcodes_only = whlg_sheet[["Postcode"]]
    postcodes_only = postcodes_only.assign(Postcode=postcodes_only["Postcode"].str.lower())

    save_csv_to_s3(
        dataframe=postcodes_only,
        bucket_name=target_bucket,
        file_name="funding/whlg eligible postcodes.csv"
    )
# Defaults reproduce the locations that were previously hard-coded inside handler()
_DEFAULT_SOURCE_DATA_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot"
_DEFAULT_OUTPUT_PATH = (
    "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv"
)
_PROJECT_HANDOVER = "elmhurst project handover"


class _LodgementInputError(ValueError, IndexError):
    """
    Raised when the source folder layout or an extracted report is not as expected.

    It also subclasses IndexError because these failures previously surfaced as a bare
    IndexError from a ``[...][0]`` lookup, so existing handlers keep working.
    """


def update_dictionary_with_check(dictionary, updates):
    """
    Updates a dictionary with key-value pairs, raising an error if the key does not exist.

    The update is all-or-nothing: every key is validated before anything is written, so a
    failed call leaves the dictionary untouched.

    Args:
        dictionary (dict): The dictionary to update.
        updates (dict): The updates to apply.

    Raises:
        KeyError: If a key in updates does not exist in the dictionary.
    """
    unknown_keys = [key for key in updates if key not in dictionary]
    if unknown_keys:
        raise KeyError(f"Key '{unknown_keys[0]}' does not exist in the dictionary.")
    dictionary.update(updates)


def _list_directories(path):
    """Return the names of the directories directly inside path."""
    return [entry for entry in os.listdir(path) if os.path.isdir(os.path.join(path, entry))]


def _list_files(path):
    """Return the names of the regular files directly inside path."""
    return [entry for entry in os.listdir(path) if os.path.isfile(os.path.join(path, entry))]


def _find_subfolder(parent, fragment, ignore_case=False):
    """Return the path of the first sub-folder of parent whose name contains fragment."""
    for name in _list_directories(parent):
        candidate = name.lower() if ignore_case else name
        if fragment in candidate:
            return os.path.join(parent, name)
    raise _LodgementInputError(f"No sub-folder containing '{fragment}' found in {parent}")


def _extract_reports(folder, extractors, include_xml=False, only=None):
    """
    Run the matching extractor over every recognised report in folder.

    :param folder: Folder whose files should be inspected
    :param extractors: Map of report type to extractor class (None means "recognised, but skip")
    :param include_xml: When True, XML files are processed as well as PDFs
    :param only: Optional collection of PDF report types to keep; all other PDFs are ignored
        without raising
    :return: dict of {report type: extracted data}
    """
    contents = {}
    for filename in _list_files(folder):
        filepath = os.path.join(folder, filename)

        if file_extraction_tools.is_pdf(filepath):
            report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
            if only is not None:
                if report_type not in only:
                    continue
            elif report_type is None:
                raise ValueError(f"Unknown report type for {filename}")

            file_extractor = extractors[report_type]
            if file_extractor is None:
                continue
            contents[report_type] = file_extractor(filepath).extract()

        if include_xml and file_extraction_tools.is_xml(filepath):
            xml_type = file_extraction_tools.detect_xml_report_type(xml_path=filepath)
            if xml_type is None:
                raise ValueError(f"Unknown report type for {filename}")

            file_extractor = extractors.get(xml_type)
            if file_extractor is None:
                continue
            contents[xml_type] = file_extractor(filepath).extract()

    return contents


def _total_floor_area(report):
    """Sum the floor area of every building part plus the conservatory floor area."""
    return sum(
        [part["Floor Area (m2)"] for part in report["Building Parts"]] +
        [report["Conservatory"]["Conservatory Floor Area"]]
    )


def _build_output_row(property_folder, extracted_contents, funding_stream):
    """
    Build one row of the lodgement spreadsheet for a property.

    :param property_folder: Folder name, expected in the form "(<Osm. ID>) <address>"
    :param extracted_contents: dict of {report type: extracted data} for the property
    :param funding_stream: Value written to the "Funding Stream" column
    :return: Copy of output_template populated with whatever the available reports provide
    """
    row = output_template.copy()

    # partition keeps everything after the first ")" so an address that itself contains
    # brackets is not truncated
    osm_id, closing_bracket, address = property_folder.partition(")")
    if not closing_bracket:
        raise _LodgementInputError(
            f"Folder name '{property_folder}' is not in the form '(<Osm. ID>) <address>'"
        )

    update_dictionary_with_check(
        row,
        {
            "Funding Stream": funding_stream,
            "Property Address": address.strip(),
            "Osm. ID": osm_id.strip().lstrip("(").strip(),
        }
    )

    epr = extracted_contents.get("elmhurst epr")
    if epr:
        total_floor_area = _total_floor_area(epr)
        energy_use_intensity = epr["Primary Energy Use Intensity (kWh/m2/yr)"]
        update_dictionary_with_check(
            row,
            {
                "Postcode": epr["Postcode"],
                "City/County": epr["County"],
                "District/Town": epr["Town"],
                "Local Authority": None,
                'SAP Rating Pre (from IMA)': epr["Current SAP Rating"],
                'Pre Heat Transfer': energy_use_intensity,
                'Pre Total Floor Area': total_floor_area,
                'Pre Heat Demand': energy_use_intensity * total_floor_area,
                "R. Assessor - Name": epr["Assessor Name"],
                "Retrofit Assessment Date": epr["Assessment Date"],
            }
        )

    full_sap = extracted_contents.get("full sap xml")
    if full_sap:
        update_dictionary_with_check(
            row,
            {
                "Property Type": full_sap["Property Type"],
                "Property Detachment": full_sap["Built Form"],
                "Property age": full_sap["Age Band"],
            }
        )

    condition_report = extracted_contents.get("osmosis condition report")
    if condition_report:
        update_dictionary_with_check(
            row,
            {"No. of Bedrooms": condition_report["No. of Bedrooms"]}
        )

    # Applied after the EPR, so the summary report wins for the fields both provide
    summary = extracted_contents.get("elmhurst summary report")
    if summary:
        update_dictionary_with_check(
            row,
            {
                "Postcode": summary["Postcode"],
                "City/County": summary["County"],
                "District/Town": summary["Town"],
                'SAP Rating Pre (from IMA)': summary["Current SAP Rating"],
                'Pre Heat Transfer': summary["Primary Energy Use Intensity (kWh/m2/yr)"],
                'Pre Total Floor Area': _total_floor_area(summary),
                'Pre Heat Demand': None,  # Don't have this
                "R. Assessor - Name": summary["Assessor Name"],
                "Retrofit Assessment Date": summary["Assessment Date"],
            }
        )

    pulse = extracted_contents.get("pulse air permeability")
    if pulse:
        # We extract the AP50 number
        ap50_values = [
            result["Extrapolated @ 50PA"] for result in pulse["Results Table"]
            if result["Metric"] == "Air Permeability"
        ]
        if not ap50_values:
            raise _LodgementInputError(
                f"No 'Air Permeability' result in the pulse report for {property_folder}"
            )
        update_dictionary_with_check(row, {"Pre Air Tightness": ap50_values[0]})

    handover = extracted_contents.get(_PROJECT_HANDOVER)
    if handover:
        update_dictionary_with_check(
            row,
            {
                "Number of Eligible Measures Installed": len(handover["Measures Fitted"]),
                "Retrofit Designer Name": handover["Designer Name"],
                "Company Name": handover["Installer Name"],
                "R. Coordinator - Name": handover["Retrofit Coordinator Name"],
            }
        )

    pas_report = extracted_contents.get("core logic pas assessment report")
    if pas_report:
        update_dictionary_with_check(
            row,
            {"No. of Bedrooms": pas_report["Number of bedrooms"]}
        )

    return row


def handler(
        source_data_path=_DEFAULT_SOURCE_DATA_PATH,
        output_path=_DEFAULT_OUTPUT_PATH,
        funding_stream="HUG2"
):
    """
    This is a simple application that will extract the data from documents that have been uploaded to Sharepoint
    to populate the lodgement spreadsheet with.

    Every folder directly inside source_data_path is treated as one property. Reports are read
    from its "RA Coordinator Info", "Air Tightness Tests" and "TrustMark Lodgement/<required
    documents>" sub-folders, and one CSV row per property is written to output_path.

    :param source_data_path: Folder holding one sub-folder per property. The source data will
        eventually come from Sharepoint
    :param output_path: Where the extracted CSV is written
    :param funding_stream: Value written to the "Funding Stream" column of every row
    :raises ValueError: If a file's report type cannot be detected, or the folder layout is not
        as expected
    :return:
    """

    # TODO: In order for this to go live, we need to use Poppler, which needs to be installed
    #   w/ brew install poppler
    #   We also need to install Tesseract: brew install tesseract

    extractors = {
        "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor,
        "elmhurst summary report": file_extraction_tools.ElmhurstSummaryReportExtractor,
        "osmosis condition report": OsmosisConditionReportParser,
        "elmhurst evidence report": None,
        "full sap xml": FullSapParser,
        "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
        _PROJECT_HANDOVER: file_extraction_tools.ElmhurstProjectHandoverExtractor,
        "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
    }

    extracted = []
    for property_folder in _list_directories(source_data_path):
        property_folder_path = os.path.join(source_data_path, property_folder)

        # Later folders overwrite earlier ones when they contain the same report type
        extracted_contents = {}

        coord_folder = _find_subfolder(property_folder_path, "RA Coordinator Info")
        extracted_contents.update(_extract_reports(coord_folder, extractors, include_xml=True))

        att_folder = _find_subfolder(property_folder_path, "Air Tightness Tests")
        extracted_contents.update(_extract_reports(att_folder, extractors))

        # Within the lodgement folder, we want the required documents sub-folder
        lodgement_folder = _find_subfolder(property_folder_path, "TrustMark Lodgement")
        required_documents_folder = _find_subfolder(
            lodgement_folder, "required documents", ignore_case=True
        )
        # There are only a few file types we actually want to process in here for the moment
        extracted_contents.update(
            _extract_reports(required_documents_folder, extractors, only=(_PROJECT_HANDOVER,))
        )

        extracted.append(_build_output_row(property_folder, extracted_contents, funding_stream))

    extracted_df = pd.DataFrame(extracted)
    extracted_df.to_csv(output_path, index=False)
CAVITY_WALL_DESCRIPTIONS = [
    "Cavity wall, as built, no insulation (assumed)",
    "Cavity wall, as built, partial insulation (assumed)",
    "Cavity wall, as built, insulated (assumed)",
    "Cavity wall, with internal insulation",
    "Cavity wall, with external insulation",
]

ROOF_DESCRIPTIONS = [
    "Pitched, no insulation",
    "Pitched, no insulation (assumed)",
    "Pitched, 25 mm loft insulation",
    "Pitched, 50 mm loft insulation",
    "Pitched, 75 mm loft insulation",
    "Pitched, 100 mm loft insulation",
    "Pitched, 150 mm loft insulation",
    "Pitched, limited insulation (assumed)",
    "Pitched, insulated (assumed)",
]

SOCIAL_TENURES = ["Rented (social)", "rental (social)"]


def process_postcode_epcs(postcode, client):
    """
    Fetch the EPCs lodged for a postcode and keep the newest certificate per property.

    :param postcode: Postcode to search for; surrounding whitespace is ignored
    :param client: EPC API client; its ``domestic.host`` and ``domestic.call`` are used
    :return: DataFrame with one row per property (the most recent "lodgement-date"), or an
        empty DataFrame when the search returns no rows
    """
    params = {"postcode": postcode.strip()}
    # Build the URL with plain string handling: os.path.join uses the platform's path
    # separator, which is wrong for URLs on Windows
    url = client.domestic.host.rstrip("/") + "/search" + "?" + urlencode({"size": 1000})
    response = client.domestic.call(method="get", url=url, params=params)

    rows = response.get("rows") if response else None
    if not rows:
        logger.warning("No EPCs found for postcode %s", postcode)
        return pd.DataFrame()
    postcode_epcs = pd.DataFrame(rows)

    # Properties without a UPRN are identified by their address instead
    postcode_epcs["uprn"] = np.where(
        pd.isnull(postcode_epcs["uprn"]),
        postcode_epcs["address"],
        postcode_epcs["uprn"]
    )
    postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
    postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
    return postcode_epcs


def filter_and_prepare_epcs(epcs):
    """
    Keep the non-social-tenure properties that qualify for cavity wall or solar and loft work.

    Two boolean columns are added:
      - "Is Cavity Property": wall description is in CAVITY_WALL_DESCRIPTIONS and SAP score <= 72
      - "Solar and Loft": roof description is in ROOF_DESCRIPTIONS, no photovoltaic supply
        recorded, and SAP score <= 68

    Rows with a blank or non-numeric SAP score satisfy neither rule. The input frame is not
    modified.

    :param epcs: DataFrame of EPC search rows
    :return: New DataFrame with the two flag columns, restricted to qualifying rows
    """
    epcs = epcs.copy()
    # Coerce rather than astype(int) so a blank score excludes the row instead of crashing
    sap_score = pd.to_numeric(epcs["current-energy-efficiency"], errors="coerce")

    epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & (
        sap_score <= 72
    )
    epcs["Solar and Loft"] = (
        epcs["roof-description"].isin(ROOF_DESCRIPTIONS)
    ) & (
        epcs["photo-supply"].isin(["0", "", "0.0"])
    ) & (
        sap_score <= 68
    )

    qualifies = epcs["Is Cavity Property"] | epcs["Solar and Loft"]
    is_social = epcs["tenure"].isin(SOCIAL_TENURES)
    return epcs[qualifies & ~is_social]


def rename_and_add_columns(epcs):
    """
    Reduce the EPC data to the reporting columns, rename them and add the estimated geometry.

    :param epcs: DataFrame as returned by filter_and_prepare_epcs
    :return: DataFrame with the renamed columns plus "Estimated Number of Floors",
        "Estimated Perimeter (m)", "Estimated Heat Loss Perimeter (m2)" and
        "Roof Insulation Thickness"
    """
    # Retrieve just the data we need
    epcs = epcs[
        [
            "uprn",
            "address",
            "postcode",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
            "tenure",
            "Is Cavity Property",
            "Solar and Loft",
        ]
    ]

    epcs = epcs.rename(
        columns={
            "address": "Address",
            "postcode": "Postcode",
            "inspection-date": "Date of last EPC",
            "current-energy-efficiency": "SAP score on register",
            "current-energy-rating": "EPC rating on register",
            "property-type": "Property Type",
            "built-form": "Archetype",
            "total-floor-area": "Property Floor Area",
            "construction-age-band": "Property Age Band",
            "floor-height": "Property Floor Height",
            "number-habitable-rooms": "Number of Habitable Rooms",
            "walls-description": "Wall Construction",
            "roof-description": "Roof Construction",
            "mainheat-description": "Heating Type",
            "secondheat-description": "Secondary Heating",
            "transaction-type": "Reason for last EPC",
            "energy-consumption-current": "Heat Demand (kWh/m2)",
            "tenure": "Tenure"
        }
    )

    # NOTE(review): these casts raise on blank values - confirm the API never returns blanks
    # for habitable rooms / floor area, or coerce them upstream
    epcs["Number of Habitable Rooms"] = epcs["Number of Habitable Rooms"].astype(int)
    epcs["Property Floor Area"] = epcs["Property Floor Area"].astype(float)

    epcs["Estimated Number of Floors"] = epcs.apply(
        lambda x: estimate_number_of_floors(x["Property Type"]) if pd.notnull(x["Property Type"]) else None, axis=1
    )

    # Area and room count are averaged per floor before estimating the perimeter
    epcs["Estimated Perimeter (m)"] = epcs.apply(
        lambda x: estimate_perimeter(
            x["Property Floor Area"] / x["Estimated Number of Floors"],
            x["Number of Habitable Rooms"] / x["Estimated Number of Floors"]
        ), axis=1
    )
    epcs["Estimated Heat Loss Perimeter (m2)"] = epcs.apply(
        lambda x: estimate_external_wall_area(
            x["Estimated Number of Floors"],
            # A floor height of 2.4 is used when none is recorded
            float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.4,
            x["Estimated Perimeter (m)"],
            x["Archetype"]
        ), axis=1
    )
    epcs["Roof Insulation Thickness"] = epcs.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()[
            "insulation_thickness"] if pd.notnull(x["Roof Construction"]) else None,
        axis=1
    )
    return epcs


def main(config=None, output_filepath=None):
    """
    This application is used to identify additional units that are private rentals or owner occupied that can be
    included in the route marches

    Required inputs are the following:
        - An excel file that contains one or many tabs that include the addresses to be visited

    One output sheet is written per configured tab. A tab for which no EPCs are found is logged
    and skipped.

    :param config: List of {"filepath", "tab", "postcode_column"} dicts; defaults to CONFIG
    :param output_filepath: Where the output workbook is written; defaults to the route march
        output location
    """
    if config is None:
        config = CONFIG
    if output_filepath is None:
        output_filepath = (
            "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024.xlsx"
        )

    client = EpcClient(auth_token=EPC_AUTH_TOKEN)

    # The context manager saves and closes the workbook even if a tab fails part-way through
    with pd.ExcelWriter(output_filepath, engine="xlsxwriter") as writer:
        for tab_config in config:
            logger.info("Processing %s", tab_config["tab"])
            # Read in the data
            route_march_addresses = pd.read_excel(
                tab_config["filepath"],
                sheet_name=tab_config["tab"],
                engine="openpyxl"
            )

            # Blank cells come back as NaN, which has no string methods, so drop them
            postcodes = route_march_addresses[tab_config["postcode_column"]].dropna().astype(str).unique()

            epcs = []
            for postcode in tqdm(postcodes):
                postcode_epcs = process_postcode_epcs(postcode, client)
                if postcode_epcs.empty:
                    continue
                epcs.append(postcode_epcs)

            if not epcs:
                # pd.concat raises on an empty list
                logger.warning("No EPCs found for any postcode in %s", tab_config["tab"])
                continue

            # Concatenate all postcodes' data and filter it
            epcs = pd.concat(epcs)
            epcs = filter_and_prepare_epcs(epcs)
            epcs = rename_and_add_columns(epcs)

            sheet_name = tab_config["tab"][:31]  # Excel sheet names max length of 31 characters
            epcs.to_excel(writer, sheet_name=sheet_name, index=False)

    logger.info("Data successfully written to %s", output_filepath)
BUILT_FORM_MAP = { "1": "Detached", + "2": "Semi-Detached", "3": "End-Terrace", "4": "Mid-Terrace", } GLAZED_AREA_MAP = { + "2": "More than Typical", "4": "Much More Than Typical" } @@ -120,7 +123,9 @@ class XmlParser: } TRANSACTION_TYPE_MAP = { - "13": "ECO assessment" + "5": "Rented (social)", + "13": "ECO assessment", + "14": "Stock condition survey", } TENURE_MAP = { @@ -131,7 +136,8 @@ class XmlParser: TARIFF_MAP = { "1": "Dual", - "2": "Single" + "2": "Single", + "3": "Unknown" } def __init__(self, file, filekey, surveyor_company, uprn=None): @@ -400,8 +406,13 @@ class XmlParser: ] wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors]) - window_areas = sum([float(w["window_area"]) for w in main_dwelling_windows]) - return wall_areas - window_areas + window_areas = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + if not window_areas: + # We discount 10% of the wall area + insulation_wall_area = wall_areas * 0.9 + else: + insulation_wall_area = wall_areas - sum(window_areas) + return insulation_wall_area def extract_additional_data(self): @@ -415,7 +426,8 @@ class XmlParser: main_dwelling_windows = [w for w in self.windows if w["window_location"] == "0"] number_of_windows = len(main_dwelling_windows) - windows_area = sum([float(w["window_area"]) for w in main_dwelling_windows]) + windows_area = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None] + windows_area = sum(windows_area) if windows_area else None boolean_lookup = { "true": True, @@ -427,6 +439,7 @@ class XmlParser: cylinder_insulation_type = { None: "", "1": "Foam", + "2": "Jacket" } cylinder_insulation_thickness = int( @@ -461,7 +474,7 @@ class XmlParser: "cylinder_thermostat": cylinder_thermostat, "main_dwelling_ground_floor_area": float(main_dwelling_ground_floor_area), "number_of_windows": int(number_of_windows), - "windows_area": float(windows_area), + "windows_area": 
float(windows_area) if windows_area is not None else windows_area, } def get_node_value(self, tag_name): @@ -769,9 +782,10 @@ class XmlParser: :return: """ - sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window") - glazing_type_lookup = { + "ND": "Single glazing", + "1": "double glazing installed before 2002", + "2": "double glazing installed during or after 2002", "3": "double glazing, unknown install date", "5": "Single glazing", } @@ -787,6 +801,40 @@ class XmlParser: "8": "North West" } + sap_windows = self.xml.getElementsByTagName("SAP-Windows") + + if not sap_windows: + # We look for Multi-Glazed-Proportion + multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazing-Type" + )[0].firstChild.nodeValue + + pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "PVC-Window-Frames" + ) + + pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None + + multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName( + "Multiple-Glazed-Proportion" + )[0].firstChild.nodeValue + + self.windows = [ + { + "window_location": "0", + "window_area": None, + "window_type": None, + "glazing_type": glazing_type_lookup[multiple_glazing_type], + "pvc_frame": pvc_frame, + "glazing_gap": None, + "orientation": None, + "multple_glazed_proportion": multple_glazed_proportion + } + ] + return + + sap_windows = sap_windows[0].getElementsByTagName("SAP-Window") + self.windows = [ self._parse_windows_content( window=window, diff --git a/input_property_list.csv b/input_property_list.csv deleted file mode 100644 index dc677c88..00000000 --- a/input_property_list.csv +++ /dev/null @@ -1,12 +0,0 @@ -address,postcode,Notes,,,, -28 Distillery Wharf,W6 9bf,,,,, -Flat 14 Godley V C House,E2 0LP,,,,, -49 Elderfield Road,E5 0LF,,,,, -26 Stanhope Road,N6 5NG,,,,, -Flat 3 Frederick Building,N1 4BD,,,,, 
-Flat 4 Frederick Building,N1 4BD,,,,, -"Flat 28, 22 Adelina Grove",E1 3BX,,,,, -"Flat 39, 239 Long Lane",SE1 4PT,,,,, -"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,, -"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,, -88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,, \ No newline at end of file diff --git a/keyzy_pilot.csv b/keyzy_pilot.csv deleted file mode 100644 index b972bcf9..00000000 --- a/keyzy_pilot.csv +++ /dev/null @@ -1,3 +0,0 @@ -address,postcode,Notes,,,, -2 South Terrace,NN1 5JY,,,,, -25 Albert Street,PO12 4TY,,,,, \ No newline at end of file diff --git a/recommendations/Costs.py b/recommendations/Costs.py index 5554245f..2d486191 100644 --- a/recommendations/Costs.py +++ b/recommendations/Costs.py @@ -37,22 +37,25 @@ MCS_SOLAR_PV_COST_DATA = { "average_cost_per_kwh-Northern Ireland": 1347, } +# Installers are now working with 435 watt panels +PANEL_SIZE = 0.435 + INSTALLER_SOLAR_COSTS = [ - {'n_panels': 4, 'array_kwp': 1.6, 'cost': 3040.00, 'installer': 'CEG'}, - {'n_panels': 5, 'array_kwp': 2.1, 'cost': 3201.00, 'installer': 'CEG'}, - {'n_panels': 6, 'array_kwp': 2.5, 'cost': 3363.00, 'installer': 'CEG'}, - {'n_panels': 7, 'array_kwp': 2.9, 'cost': 3524.00, 'installer': 'CEG'}, - {'n_panels': 8, 'array_kwp': 3.3, 'cost': 3686.00, 'installer': 'CEG'}, - {'n_panels': 9, 'array_kwp': 3.7, 'cost': 3847.00, 'installer': 'CEG'}, - {'n_panels': 10, 'array_kwp': 4.1, 'cost': 4009.00, 'installer': 'CEG'}, - {'n_panels': 11, 'array_kwp': 4.5, 'cost': 4170.00, 'installer': 'CEG'}, - {'n_panels': 12, 'array_kwp': 4.9, 'cost': 4332.00, 'installer': 'CEG'}, - {'n_panels': 13, 'array_kwp': 5.3, 'cost': 4835.00, 'installer': 'CEG'}, - {'n_panels': 14, 'array_kwp': 5.7, 'cost': 5015.00, 'installer': 'CEG'}, - {'n_panels': 15, 'array_kwp': 6.2, 'cost': 5176.00, 'installer': 'CEG'}, - {'n_panels': 16, 'array_kwp': 6.6, 'cost': 5338.00, 'installer': 'CEG'}, - {'n_panels': 17, 'array_kwp': 7.0, 'cost': 
5500.00, 'installer': 'CEG'}, - {'n_panels': 18, 'array_kwp': 7.4, 'cost': 6021.00, 'installer': 'CEG'} + {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'}, + {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'}, + {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'}, + {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'}, + {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'}, + {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'}, + {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'}, + {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'}, + {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'}, + {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'}, + {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'}, + {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'}, + {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'}, + {'n_panels': 17, 'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'}, + {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'} ] # This is the maximum number of panels that we have a cost from the installers for INSTALLER_MAX_PANELS = 18 @@ -62,11 +65,11 @@ INSTALLER_MAX_PANELS = 18 INSTALLER_SOLAR_PV_INVERTER_COST = 7500 INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs -INSTALLER_SCAFFOLDING_COSTS = [ - {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'}, - {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'}, - {'stories': 3, 'description': '3 Story Scaffold', 'cost': 1077.00, 'installer': 'CEG'} -] +# 
INSTALLER_SCAFFOLDING_COSTS = [ +# {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'}, +# {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'}, +# {'stories': 3, 'description': '3 Story Scaffold', 'cost': 1077.00, 'installer': 'CEG'} +# ] # This data is based on the MCS database, We use the larger figure between the 2023 and 2024 average, # to be conservative @@ -101,10 +104,10 @@ INSTALLER_ASHP_COSTS = [ BOILER_UPGRADE_SCHEME_ASHP_VALUE = 7500 INSTALLER_SOLAR_BATTERY_COSTS = [ - {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 2700.00, 'installer': 'CEG'}, - {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, - {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, - {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} + {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'}, + # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'}, + # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'}, + # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'} ] # This is based on https://www.checkatrade.com/blog/cost-guides/cost-smart-thermostat/ @@ -149,7 +152,7 @@ CONDENSING_BOILER_COSTS = { ELECTRIC_BOILER_COSTS = 1800 # Assumes 1 hours to remove each heater (including re-decorating) -ROOM_HEATER_REMOVAL_COST = 50 +ROOM_HEATER_REMOVAL_COST = 25 ROOM_HEATER_REMOVAL_LABOUR_HOURS = 3 # This is a cost quoted by Jim for a system flush - existig system will run more efficiently @@ -190,6 +193,8 @@ class Costs: # fittings and trimming doors, as well as scope for damage to the existing wall during preparation. 
IWI_CONTINGENCY = 0.2 + # For air source heat pumps, we inflate the assume cost by quite a bit to account for design and installation + ASHP_CONTINGENCY = 0.35 # Where there is more uncertainty, a higher contingency rate is used HIGH_RISK_CONTINGENCY = 0.2 # When there is less uncertainty, a lower contingency rate is used @@ -234,6 +239,13 @@ class Costs: if self.region is None: # Try and grab using the local-authority-label self.region = county_to_region_map.get(self.property.data["local-authority-label"], None) + + if self.region is None: + # Try and get the region after converting the keys to lower + self.region = { + k.lower(): v for k, v in county_to_region_map.items() + }.get(self.property.data["local-authority-label"].lower(), None) + if self.region is None: raise ValueError("Region not found in county map") @@ -719,8 +731,9 @@ class Costs: "labour_days": labour_days } + @classmethod def solar_pv( - self, + cls, n_panels: int | float, has_battery: bool = False, array_cost=None, @@ -758,33 +771,28 @@ class Costs: else: system_cost = [c for c in INSTALLER_SOLAR_COSTS if c["n_panels"] == n_panels][0]["cost"] - total_cost = array_cost if array_cost is not None else system_cost + subtotal = array_cost if array_cost is not None else system_cost if has_battery: battery_cost = [c for c in INSTALLER_SOLAR_BATTERY_COSTS if c["capacity_kwh"] == battery_kwh][0]["cost"] - total_cost += battery_cost - - scaffolding_cost = [c for c in INSTALLER_SCAFFOLDING_COSTS if c["stories"] == n_floors][0]["cost"] - total_cost += scaffolding_cost + subtotal += battery_cost if needs_inverter: - total_cost += INSTALLER_SOLAR_PV_INVERTER_COST + subtotal += INSTALLER_SOLAR_PV_INVERTER_COST # We also add an additional labour cost - total_cost += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST + subtotal += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST - # We add an additional cost for scaffolding - - subtotal_before_vat = total_cost / (1 + self.VAT_RATE) - - vat = total_cost - subtotal_before_vat + # 
Solar doesn't have VAT but we add a high risk contingency + # to account for design variation that we see in practice + total_cost = subtotal * (1 + cls.HIGH_RISK_CONTINGENCY) # Labour hours are based on estimates from online research but an average team seems to consist of 3 people # and most jobs take around 2 days. Assuming an 8 hour day for 3 people across 2 days, gives us 48 hours of # labour return { "total": total_cost, - "subtotal": subtotal_before_vat, - "vat": vat, + "subtotal": subtotal, + "vat": 0, "labour_hours": 48, "labour_days": 2, } @@ -1154,7 +1162,6 @@ class Costs: pump. This cost will include the boiler upgrade scheme grant """ - # This is the average cost of a project, we'll add some additional contingency if ashp_size is None: @@ -1163,9 +1170,10 @@ class Costs: cost = [x for x in INSTALLER_ASHP_COSTS if x][0]["cost"] # We add some contingency since there are additional costs such as resizing radiators, that could be required - total_cost = cost * (1 + self.CONTINGENCY) - subtotal_before_vat = total_cost / (1 + self.VAT_RATE) - vat = total_cost - subtotal_before_vat + subtotal = cost * (1 + self.ASHP_CONTINGENCY) + # The costs from installers exclude VAT + vat = subtotal * self.VAT_RATE + total_cost = subtotal + vat # We assume 5 days installation labour_days = 5 @@ -1173,7 +1181,7 @@ class Costs: return { "total": total_cost, - "subtotal": subtotal_before_vat, + "subtotal": subtotal, "vat": vat, "labour_hours": labour_hours, "labour_days": labour_days, diff --git a/recommendations/DraughtProofingRecommendations.py b/recommendations/DraughtProofingRecommendations.py index 4bd85a03..a16a94f6 100644 --- a/recommendations/DraughtProofingRecommendations.py +++ b/recommendations/DraughtProofingRecommendations.py @@ -26,6 +26,9 @@ class DraughtProofingRecommendations: if not draught_proofing_recommendation_config: return + # Cost is based on a £50 cost per window, based on Checkatrade + cost = draught_proofing_recommendation_config.get("cost", 
self.property.number_of_windows * 50) + description = ( "Draught proof doors and windows to improve energy efficiency" if not draught_proofing_recommendation_config.get("description") @@ -48,7 +51,7 @@ class DraughtProofingRecommendations: "kwh_savings": 0, "co2_equivalent_savings": 0, "energy_cost_savings": 0, - "total": draught_proofing_recommendation_config["cost"], + "total": cost, # We use a very simple and rough estimate of 4 hours per unit "labour_hours": draught_proofing_recommendation_config.get("labour_hours", 8), "labour_days": draught_proofing_recommendation_config.get("labour_days", 1), # Assume 8 hour day diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py index 25741e7a..85e1a8dc 100644 --- a/recommendations/FloorRecommendations.py +++ b/recommendations/FloorRecommendations.py @@ -145,7 +145,9 @@ class FloorRecommendations(Definitions): ) return - raise NotImplementedError("Implement me!") + # In this case, we have no recommendation to make. 
E.g., if we have a solid floor property + # but solid floor insulation has been excluded as a measure, we get here + return @staticmethod def _make_floor_description(material): @@ -172,6 +174,11 @@ class FloorRecommendations(Definitions): insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recs = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None for _, insulation_material_group in insulation_materials.groupby("description"): @@ -217,6 +224,9 @@ class FloorRecommendations(Definitions): else: raise NotImplementedError("Implement me!") + sap_points = non_invasive_recs.get("sap_points", None) + survey = non_invasive_recs.get("survey", False) + floor_ending_config = FloorAttributes(new_description).process() floor_simulation_config = check_simulation_difference( new_config=floor_ending_config, old_config=self.property.floor, prefix="floor_" @@ -245,7 +255,8 @@ class FloorRecommendations(Definitions): "description": self._make_floor_description(material), "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": sap_points, + "survey": survey, "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { diff --git a/recommendations/HeatingControlRecommender.py b/recommendations/HeatingControlRecommender.py index c613aa42..bd015a79 100644 --- a/recommendations/HeatingControlRecommender.py +++ b/recommendations/HeatingControlRecommender.py @@ -12,7 +12,7 @@ class HeatingControlRecommender: self.recommendation = [] - def recommend(self, heating_description, description_prefix="", description_suffix=""): + def recommend(self, heating_description, phase, description_prefix="", description_suffix=""): # TODO: Many of these functions are quite similar. 
We can possibly create a single wrapper function that # takes in the heating description and the description prefix/suffix, and then creates the appropriate @@ -23,32 +23,32 @@ class HeatingControlRecommender: # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system if heating_description in ["Room heaters, electric"]: - self.recommend_room_heaters_electric_controls() + self.recommend_room_heaters_electric_controls(phase=phase) return if heating_description in ["Electric storage heaters", "Electric storage heaters, radiators"]: - self.recommend_high_heat_retention_controls(description_prefix=description_prefix) + self.recommend_high_heat_retention_controls(description_prefix=description_prefix, phase=phase) return if heating_description in ["Boiler and radiators, mains gas"]: # We can recommend roomstat programmer trvs - self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix) + self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix, phase=phase) # We can also recommend time and temperature zone controls - self.recommend_time_temperature_zone_controls(description_suffix=description_suffix) + self.recommend_time_temperature_zone_controls(description_suffix=description_suffix, phase=phase) return if heating_description in ["Boiler and radiators, electric"]: - self.recommend_roomstat_programmer_trvs() + self.recommend_roomstat_programmer_trvs(phase=phase) return if heating_description in ["Air source heat pump, radiators, electric"]: # For an ASHP, we can recommend time and temperature zone controls, as well as programmer, trvs and a bypass # which are common configurations for ASHPs - self.recommend_time_temperature_zone_controls() + self.recommend_time_temperature_zone_controls(phase=phase) # self.recommend_programmer_trvs_bypass() - def recommend_room_heaters_electric_controls(self): + def 
recommend_room_heaters_electric_controls(self, phase): """ If the home has Room heaters, electric, we start by identifying potential heating controls that could be upgraded, that would provide a practical impact. This will be the least invasive improvement. @@ -88,6 +88,9 @@ class HeatingControlRecommender: self.recommendation.append( { + "phase": phase, + "type": "heating", + "measure_type": "programmer_appliance_thermostat", "description": "upgrade heating controls to Programmer and Appliance or Smart Thermostats", **self.costs.programmer_and_appliance_thermostat(has_programmer=has_programmer), "simulation_config": simulation_config @@ -97,7 +100,7 @@ class HeatingControlRecommender: # We don't implement any other recommendations right now return - def recommend_high_heat_retention_controls(self, description_prefix=""): + def recommend_high_heat_retention_controls(self, phase, description_prefix=""): """ When applicable, we recommend upgrading the heating controls to high heat retention controls. This is a specific type of control system that is designed to work with electric storage heaters. It is a more @@ -133,6 +136,9 @@ class HeatingControlRecommender: self.recommendation.append( { + "phase": phase, + "type": "heating", + "measure_type": "celect_type_controls", "description": "Upgrade heating controls to High Heat Retention Storage Heater Controls", **self.costs.celect_type_controls(), "simulation_config": simulation_config, @@ -143,7 +149,7 @@ class HeatingControlRecommender: # We don't implement any other recommendations right now return - def recommend_roomstat_programmer_trvs(self, description_suffix=""): + def recommend_roomstat_programmer_trvs(self, phase, description_suffix=""): """ If the home has a boiler and radiators, mains gas, we start by identifying potential heating controls that could be upgraded, that would provide a practical impact. 
@@ -208,15 +214,16 @@ class HeatingControlRecommender: description = "Upgrade heating controls to Room thermostat, programmer and TRVs" - already_installed = "heating_control" in self.property.already_installed + already_installed = "roomstat_programmer_trvs" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." self.recommendation.append( { - "type": "heating_control", + "type": "heating", "measure_type": "roomstat_programmer_trvs", + "phase": phase, "parts": [], "description": description, **cost_result, @@ -231,7 +238,7 @@ class HeatingControlRecommender: return - def recommend_time_temperature_zone_controls(self, description_suffix=""): + def recommend_time_temperature_zone_controls(self, phase, description_suffix=""): """ If the home has a boiler, we can recommend time and temperature zone controls. This is a more advanced and more efficient control system than the standard controls that come with a boiler. However, it may come @@ -282,14 +289,15 @@ class HeatingControlRecommender: "temperature zone control)" ) - already_installed = "heating_control" in self.property.already_installed + already_installed = "time_temperature_zone_control" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." 
self.recommendation.append( { - "type": "heating_control", + "type": "heating", + "phase": phase, "measure_type": "time_temperature_zone_control", "parts": [], "description": description, @@ -335,14 +343,15 @@ class HeatingControlRecommender: description = "Install a Bypass valve, TRVs and a Programmer" - already_installed = "heating_control" in self.property.already_installed + already_installed = "programmer_trvs_bypass" in self.property.already_installed if already_installed: cost_result = override_costs(cost_result) description = "Heating controls have already been upgraded, no further action needed." self.recommendation.append( { - "type": "heating_control", + "type": "heating", + "measure_type": "programmer_trvs_bypass", "parts": [], "description": description, **cost_result, diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py index 7dc4f8b2..20f5e7ad 100644 --- a/recommendations/HeatingRecommender.py +++ b/recommendations/HeatingRecommender.py @@ -65,7 +65,6 @@ class HeatingRecommender: self.costs = Costs(self.property) self.heating_recommendations = [] - self.heating_control_recommendations = [] self.has_electric_heating_description = ( self.property.main_heating["has_electric"] or self.property.main_heating["has_electricaire"] @@ -259,7 +258,6 @@ class HeatingRecommender: "ashp_only_heating_recommendation", False ) self.heating_recommendations = [] - self.heating_control_recommendations = [] # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system @@ -302,7 +300,6 @@ class HeatingRecommender: self.recommend_air_source_heat_pump( phase=phase, has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations, - ) return @@ -360,7 +357,7 @@ class HeatingRecommender: } controls_recommender = HeatingControlRecommender(self.property) - controls_recommender.recommend(heating_description="Boiler and radiators, electric") + 
controls_recommender.recommend(heating_description="Boiler and radiators, electric", phase=phase) self.heating_recommendations.extend([boiler_recommendation] + controls_recommender.recommendation) return @@ -453,7 +450,7 @@ class HeatingRecommender: ), {}) controls_recommender = HeatingControlRecommender(self.property) - controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric") + controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric", phase=phase) ashp_size = self.size_heat_pump() ashp_costs = self.costs.air_source_heat_pump(ashp_size) @@ -631,7 +628,8 @@ class HeatingRecommender: heating_controls_only, system_change, system_type, - measure_type + measure_type, + non_intrusive_recommendation=None ): """ Given a recommendation for heating controls, and a recommendation for the heating system, we combine the two @@ -649,8 +647,13 @@ class HeatingRecommender: :param system_type: The type of heating system we are recommending :param measure_type: The type of measure we are recommending - more granular than the "type" field, allowing us to distinguish between different types of heating recommendations + :param non_intrusive_recommendation: A non-intrusive recommendation, which may specify the number of SAP points + or a cost for this recommendation """ + if non_intrusive_recommendation is None: + non_intrusive_recommendation = {} + # We produce recommendations with & without heating controls # We will also produce a recommendation for heating controls only heating_controls_switch = [True, False] if controls_recommendations else [False] @@ -698,13 +701,14 @@ class HeatingRecommender: "description": recommendation_description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_intrusive_recommendation.get("sap_points"), "already_installed": already_installed, **total_costs, "simulation_config": recommendation_simulation_config, "description_simulation": 
recommendation_description_simulation, # We insert the heating system type here - "system_type": system_type + "system_type": system_type, + "survey": non_intrusive_recommendation.get("survey", False) } output.append(recommendation) @@ -798,7 +802,9 @@ class HeatingRecommender: description_prefix = "" controls_recommender.recommend( - heating_description="Electric storage heaters", description_prefix=description_prefix + heating_description="Electric storage heaters", + description_prefix=description_prefix, + phase=phase ) has_hhr = self.is_hhr_already_installed() @@ -807,6 +813,13 @@ class HeatingRecommender: # No recommendation needed return + # We check if there is a high heat retention non-intrusive recommendation + non_intrusive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == "high_heat_retention_storage_heater"), + {} + ) + # We check if the property has dual heating in place with a boiler and storage heaters if self.dual_heating: new_heating_description = self.DUAL_HEATING_DESCRIPTIONS[ @@ -838,6 +851,8 @@ class HeatingRecommender: else: heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"] + # TODO:We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion + # we'll keep this for the moment though if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]: heating_simulation_config["hot_water_energy_eff_ending"] = "Average" else: @@ -895,7 +910,8 @@ class HeatingRecommender: heating_controls_only=heating_controls_only, system_change=system_change, system_type="high_heat_retention_storage_heater", - measure_type="high_heat_retention_storage_heater" + measure_type="high_heat_retention_storage_heater", + non_intrusive_recommendation=non_intrusive_recommendation ) if _return: return recommendations @@ -978,9 +994,13 @@ class HeatingRecommender: # We check if there's a mains connection and the hot water is 
inefficient, as this will improve with a boiler has_inefficient_water = ( self.property.data["mains-gas-flag"] and - self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"] + self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"] ) + non_invasive_recommendation = next(( + r for r in self.property.non_invasive_recommendations if r["type"] == "boiler_upgrade" + ), {}) + if has_inefficient_space_heating or has_inefficient_water: boiler_size = self.estimate_boiler_size( property_type=self.property.data["property-type"], @@ -1079,12 +1099,13 @@ class HeatingRecommender: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": description_simulation, **boiler_costs, "system_type": "boiler_upgrade", + "survey": non_invasive_recommendation.get("survey", None) } # We recommend the heating controls @@ -1098,10 +1119,10 @@ class HeatingRecommender: description_suffix = "" controls_recommender.recommend( heating_description="Boiler and radiators, mains gas", - description_suffix=description_suffix + description_suffix=description_suffix, + phase=recommendation_phase ) # We may have 2 recommendations from the heating controls - if not controls_recommender.recommendation and not boiler_recommendation: return @@ -1111,6 +1132,8 @@ class HeatingRecommender: if system_change: # We combine the heating and controls recommendations, in the case of a system change + # If this is true, we set SAP points to None and survey to False for the boiler recommendation + combined_recommendations = [] for controls_recommendation in controls_recommender.recommendation: combined_recommendation = self.combine_heating_and_controls( @@ -1137,10 +1160,6 @@ class HeatingRecommender: # 3) Heating controls only # But they are options that are not mutually 
exclusive # So, we actually set heating controls as a heating recommendation - for recommendation in controls_recommender.recommendation: - recommendation["phase"] = recommendation_phase - # recommendation["type"] = "heating" - - self.heating_control_recommendations.extend(controls_recommender.recommendation) + self.heating_recommendations.extend(controls_recommender.recommendation) return diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py index 636a7be0..d8404cc1 100644 --- a/recommendations/HotwaterRecommendations.py +++ b/recommendations/HotwaterRecommendations.py @@ -20,26 +20,66 @@ class HotwaterRecommendations: :return: """ # Reset the recommendations + recommendations_phase = phase + self.recommendations = [] + non_invasive_recommendations = self.property.non_invasive_recommendations + if non_invasive_recommendations: + measures = [ + r["type"] for r in non_invasive_recommendations if + r["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"] + ] + + for m in measures: + non_invasive_rec = [ + r for r in non_invasive_recommendations if r["type"] == m + ][0] + if m == "hot_water_tank_insulation": + # We need to be able to stack these recommendations + self.recommend_tank_insulation( + phase=recommendations_phase, + sap_points=non_invasive_rec["sap_points"], + survey=non_invasive_rec["survey"], + ) + + recommendations_phase += 1 + elif m == "cylinder_thermostat": + self.recommend_cylinder_thermostat( + phase=recommendations_phase, + sap_points=non_invasive_rec["sap_points"], + survey=non_invasive_rec["survey"], + ) + recommendations_phase += 1 # This first iteration of the recommender will provide very basic recommendation # We recommend heating controls based on the main heating system - # If there is no system present, but access to the mains, we + if self.property.hotwater["clean_description"] == "Gas boiler/circulator, no cylinder thermostat": + # Handle this case specifically: + 
self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=recommendations_phase) + return + + # If there is no system present, but access to the mains, we + + has_tank_recommendation = [r for r in self.recommendations if r["type"] == "hot_water_tank_insulation"] if ( (self.property.hotwater["heater_type"] in ["electric immersion"]) & (self.property.data["hot-water-energy-eff"] == "Very Poor") & - (self.property.hotwater["no_system_present"] is None) + (self.property.hotwater["no_system_present"] is None) & + (len(has_tank_recommendation) == 0) ): - self.recommend_tank_insulation(phase=phase) + self.recommend_tank_insulation(phase=recommendations_phase) return - if self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat": - self.recommend_cylinder_thermostat(phase=phase) + has_cylinder_recommendation = [r for r in self.recommendations if r["type"] == "cylinder_thermostat"] + + if ((self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat") & + (len(has_cylinder_recommendation) == 0)): + self.recommend_cylinder_thermostat(phase=recommendations_phase) return - def recommend_tank_insulation(self, phase): + def recommend_tank_insulation(self, phase, sap_points=None, survey=False, _return=False): """ If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water tank. This is a very simple and cost effective improvement that can be made to the home. 
It will likely @@ -55,27 +95,30 @@ class HotwaterRecommendations: else: description = "Insulate hot water tank" - self.recommendations.append( - { - "phase": phase, - "parts": [], - "type": "hot_water_tank_insulation", - "measure_type": "hot_water_tank_insulation", - "description": description, - "starting_u_value": None, - "new_u_value": None, - "sap_points": None, - "already_installed": already_installed, - **recommendation_cost, - "simulation_config": {"hot_water_energy_eff_ending": "Poor"}, - "description_simulation": { - "hot-water-energy-eff": "Poor" - } - } - ) + to_append = { + "phase": phase, + "parts": [], + "type": "hot_water_tank_insulation", + "measure_type": "hot_water_tank_insulation", + "description": description, + "starting_u_value": None, + "new_u_value": None, + "sap_points": sap_points, + "already_installed": already_installed, + **recommendation_cost, + "simulation_config": {"hot_water_energy_eff_ending": "Poor"}, + "description_simulation": { + "hot-water-energy-eff": "Poor" + }, + "survey": survey + } + if _return: + return to_append + + self.recommendations.append(to_append) return - def recommend_cylinder_thermostat(self, phase): + def recommend_cylinder_thermostat(self, phase, sap_points=None, survey=False, _return=False): """ If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water tank. This is a very simple and cost effective improvement that can be made to the home. 
@@ -101,23 +144,86 @@ class HotwaterRecommendations: **hotwater_simulation_config } - self.recommendations.append( - { - "phase": phase, - "parts": [], - "type": "cylinder_thermostat", - "measure_type": "cylinder_thermostat", - "description": description, - "starting_u_value": None, - "new_u_value": None, - "sap_points": None, - "already_installed": already_installed, - **recommendation_cost, - "simulation_config": simulation_config, - "description_simulation": { - "hot-water-energy-eff": self.property.data["hot-water-energy-eff"], - "hotwater-description": new_epc_description, - } - } - ) + to_append = { + "phase": phase, + "parts": [], + "type": "cylinder_thermostat", + "measure_type": "cylinder_thermostat", + "description": description, + "starting_u_value": None, + "new_u_value": None, + "sap_points": sap_points, + "already_installed": already_installed, + **recommendation_cost, + "simulation_config": simulation_config, + "description_simulation": { + "hot-water-energy-eff": self.property.data["hot-water-energy-eff"], + "hotwater-description": new_epc_description, + }, + "survey": survey + } + if _return: + return to_append + + self.recommendations.append(to_append) + return + + def recommend_cylinder_thermostat_gas_boiler_circulator(self, phase): + """ + If the home has a very poor hot water system, this is often indicative of a lack of insulation on the + hot water + tank. This is a very simple and cost effective improvement that can be made to the home. 
+ """ + + thermostat_recommendation_cost = self.costs.cylinder_thermostat() + cylinder_recommendation_cost = self.costs.hot_water_tank_insulation() + # Add them + total_cost = { + k: thermostat_recommendation_cost[k] + cylinder_recommendation_cost[k] for k in + thermostat_recommendation_cost.keys() + } + + already_installed = "cylinder_thermostat" in self.property.already_installed + if already_installed: + total_cost = override_costs(total_cost) + description = "Cylinder thermostat & insulation has already been installed, no further action required" + else: + description = "Install a smart cylinder thermostat and insulate the hot water tank with 80mm insulation" + + new_epc_description = "From main system" + hotwater_ending_config = HotWaterAttributes(new_epc_description).process() + hotwater_simulation_config = check_simulation_difference( + new_config=hotwater_ending_config, old_config=self.property.hotwater + ) + + if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"]: + new_efficiency = "Good" + else: + new_efficiency = self.property.data["hot-water-energy-eff"] + + simulation_config = { + "hot_water_energy_eff_ending": new_efficiency, + **hotwater_simulation_config + } + + to_append = { + "phase": phase, + "parts": [], + "type": "cylinder_thermostat", + "measure_type": "cylinder_thermostat", + "description": description, + "starting_u_value": None, + "new_u_value": None, + "sap_points": None, + "already_installed": already_installed, + **total_cost, + "simulation_config": simulation_config, + "description_simulation": { + "hot-water-energy-eff": simulation_config["hot_water_energy_eff_ending"], + "hotwater-description": new_epc_description, + }, + "survey": False + } + + self.recommendations.append(to_append) return diff --git a/recommendations/LightingRecommendations.py b/recommendations/LightingRecommendations.py index f9a1d63a..3447394d 100644 --- a/recommendations/LightingRecommendations.py +++ 
b/recommendations/LightingRecommendations.py @@ -4,6 +4,7 @@ from backend.Property import Property from typing import List from recommendations.Costs import Costs from recommendations.recommendation_utils import override_costs +from backend.ml_models.AnnualBillSavings import AnnualBillSavings class LightingRecommendations: @@ -161,6 +162,7 @@ class LightingRecommendations: # the proportion of lights that will be set to low energy "sap_points": sap_points, "kwh_savings": heat_demand_change, + "energy_cost_savings": heat_demand_change * AnnualBillSavings.ELECTRICITY_PRICE_CAP, "co2_equivalent_savings": carbon_change, "description_simulation": { "lighting-energy-eff": "Very Good", diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py index dd51b47d..0e73cffe 100644 --- a/recommendations/Recommendations.py +++ b/recommendations/Recommendations.py @@ -142,19 +142,17 @@ class Recommendations: # Ventilation recommendations # We only produce a ventilation recommendation if the property is recommended to have wall or roof - # insulation - # We will not attribute a SAP impact to the ventilation recommendation, since we've seen that this - # has no - # real impact on the SAP score. Therefore, we don't need to include phasing for ventilation. If we - # have any - # wall or roof recommendations, we will ensure that ventilation is included in the simulation + # insulation We will not attribute a SAP impact to the ventilation recommendation, since we've seen that this + # has no real impact on the SAP score. Therefore, we don't need to include phasing for ventilation. 
If we + # have any wall or roof recommendations, we will ensure that ventilation is included in the simulation if ( (self.wall_recomender.recommendations or self.roof_recommender.recommendations) and ("ventilation" in measures) ): - self.ventilation_recomender.recommend() + self.ventilation_recomender.recommend(phase=phase) if self.ventilation_recomender.recommendation: property_recommendations.append(self.ventilation_recomender.recommendation) + phase += 1 if "trickle_vents" in measures: # This is a recommendatin that typically comes from an energy assessment @@ -211,27 +209,25 @@ class Recommendations: measures=measures, has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations, ) - if ( - self.heating_recommender.heating_recommendations or - self.heating_recommender.heating_control_recommendations - ): + if self.heating_recommender.heating_recommendations: # We split into first and second phase recommendations first_phase_recommendations = [ r for r in ( - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations + self.heating_recommender.heating_recommendations ) if r["phase"] == phase ] second_phase_recommendations = [ r for r in ( - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations + self.heating_recommender.heating_recommendations ) if r["phase"] == phase + 1 ] + if first_phase_recommendations and second_phase_recommendations: + raise Exception("Imeplement me") + if first_phase_recommendations: property_recommendations.append(first_phase_recommendations) @@ -243,8 +239,7 @@ class Recommendations: # otherwise we incremenet by 1 max_used_phase = max( [rec["phase"] for rec in - self.heating_recommender.heating_recommendations + - self.heating_recommender.heating_control_recommendations] + self.heating_recommender.heating_recommendations] ) amount_to_increment = max_used_phase - phase + 1 phase += amount_to_increment @@ -253,8 +248,13 @@ 
class Recommendations: if "hot_water" in measures: self.hotwater_recommender.recommend(phase=phase) if self.hotwater_recommender.recommendations: - property_recommendations.append(self.hotwater_recommender.recommendations) - phase += 1 + if len(self.hotwater_recommender.recommendations) > 1: + for r in self.hotwater_recommender.recommendations: + property_recommendations.append([r]) + phase += 1 + else: + property_recommendations.append(self.hotwater_recommender.recommendations) + phase += 1 if "secondary_heating" in measures: self.secondary_heating_recommender.recommend(phase=phase) @@ -304,12 +304,12 @@ class Recommendations: # want to include the cavity wall insulation recommendation in the defaults if recommendations_by_type[0].get("type") in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing" + "trickle_vents", "draught_proofing" ]: continue has_u_value = recommendations_by_type[0].get("new_u_value") is not None - has_sap_points = recommendations_by_type[0].get("sap_points") is not None + has_sap_points = all([r.get("sap_points") is not None for r in recommendations_by_type]) has_rank = recommendations_by_type[0].get("rank") is not None # When check if these recommendations have two different types, such as solid wall insulation @@ -447,6 +447,7 @@ class Recommendations: property_instance, all_predictions, recommendations, + representative_recommendations, ): """ @@ -460,6 +461,7 @@ class Recommendations: :param property_instance: Instance of the Property class, for the home associated to property_id :param all_predictions: dictionary of predictions from the model apis :param recommendations: dictionary of recommendations for the property + :param representative_recommendations: dictionary of representative recommendations for the property :return: """ @@ -471,15 +473,20 @@ class Recommendations: property_recommendations = recommendations[property_instance.id].copy() + representative_recs = 
representative_recommendations[property_instance.id].copy() + representative_ids = [r["recommendation_id"] for r in representative_recs] + increasing_variables = ["sap"] decreasing_variables = ["carbon", "heat_demand"] + # If the recommendation is mechanical ventilation, we don't apply the rule that the new value should be higher + mv_increasing_variables = ["carbon", "heat_demand"] + mv_decreasing_variables = ["sap"] + impact_summary = [] for recommendations_by_type in property_recommendations: for rec in recommendations_by_type: - if rec["type"] in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" - ]: + if rec["type"] in ["trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"]: # We don't have a percieved sap impact of mechanical ventilation or trickle vents, and we don't # have the capacity to score draught proofing if rec["type"] == "extension_cavity_wall_insulation": @@ -497,7 +504,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], "sap": sap + rec["sap_points"], "carbon": carbon - rec["co2_equivalent_savings"], "heat_demand": heat_demand - rec["heat_demand"], @@ -519,15 +528,21 @@ class Recommendations: # heating_cost_starting and heating_cost_ending are just the values in the EPC. However, with # heating_cost_ending, we expect that the EPC will predict a heating cost based on what would happen # if we implemented the recommendation today, so our starting value is the EPC + previous_phase_values = { "sap": float(property_instance.data["current-energy-efficiency"]), + # For carbon, even though we generally use the updated figure which includes the carbon + # associated to appliances, for this scoring process we use the EPC carbon value. 
This means + # that we don't overestimate the impact since the model uses the EPC carbon value "carbon": float(property_instance.data["co2-emissions-current"]), "heat_demand": float(property_instance.data["energy-consumption-current"]), } else: - previous_phase_values_multiple = [x for x in impact_summary if x["phase"] == (rec["phase"] - 1)] + previous_phase_values_multiple = [ + x for x in impact_summary if x["phase"] == (rec["phase"] - 1) and x["representative"] + ] if len(previous_phase_values_multiple) != 1: # Take an average of each of the previous phases keys_to_median = ["sap", "carbon", "heat_demand"] @@ -541,8 +556,13 @@ class Recommendations: previous_phase_values = previous_phase_values_multiple[0] # We extract the values for the current phase + if rec.get("survey", False): + current_phase_sap = rec["sap_points"] + previous_phase_values["sap"] + else: + current_phase_sap = phase_energy_efficiency_metrics["sap_change"] + current_phase_values = { - "sap": phase_energy_efficiency_metrics["sap_change"], + "sap": current_phase_sap, "carbon": phase_energy_efficiency_metrics["carbon_change"], "heat_demand": phase_energy_efficiency_metrics["heat_demand"], } @@ -552,13 +572,23 @@ class Recommendations: # For decreasing variables, the new value should be lower than the previous, otherwise we set it to # the previous # In either case, we adjudge the recommendation to have had no/negligible impact - for v in increasing_variables: + # However, if the recommendation is mechanical ventilation, this can have a negative SAP impact so + # we don't apply this rule + + if rec["type"] == "mechanical_ventilation": + phase_increasing_variables = mv_increasing_variables + phase_decreasing_variables = mv_decreasing_variables + else: + phase_increasing_variables = increasing_variables + phase_decreasing_variables = decreasing_variables + + for v in phase_increasing_variables: current_phase_values[v] = ( current_phase_values[v] if current_phase_values[v] > previous_phase_values[v] 
else previous_phase_values[v] ) for v in previous_phase_values: - if v in decreasing_variables: + if v in phase_decreasing_variables: current_phase_values[v] = ( current_phase_values[v] if current_phase_values[v] < previous_phase_values[v] else previous_phase_values[v] @@ -573,13 +603,19 @@ class Recommendations: "heat_demand": previous_phase_values["heat_demand"] - current_phase_values["heat_demand"], } - # Prevent from being negative + # Prevent from being negative - apart from ventilation for metric in ["sap", "carbon", "heat_demand"]: - property_phase_impact[metric] = ( - 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric] - ) - if metric == "sap": - property_phase_impact[metric] = round(property_phase_impact[metric], 2) + if rec["type"] != "mechanical_ventilation": + property_phase_impact[metric] = ( + 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric] + ) + if metric == "sap": + property_phase_impact[metric] = round(property_phase_impact[metric], 2) + else: + # We prevent these from being positive + property_phase_impact[metric] = ( + 0 if property_phase_impact[metric] > 0 else property_phase_impact[metric] + ) # For the moment, we cap the number of SAP points that can be achieved by LEDs at 2 if rec["type"] == "low_energy_lighting": @@ -599,11 +635,18 @@ class Recommendations: # By limiting here, we don't change the value in current_phase_values. 
This means that the # future recommendations won't have an impact that is too large li_sap_limit = RoofRecommendations.get_loft_insulation_sap_limit( - property_instance.data["roof-energy-eff"], property_instance.data["extension-count"] + property_instance.data["roof-energy-eff"], property_instance.roof["insulation_thickness"] ) if li_sap_limit is not None: property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit) + if rec["type"] == "solar_pv": + # We use the SAP points in the recommendation as a minimum + property_phase_impact["sap"] = ( + rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else + property_phase_impact["sap"] + ) + # Insert this information into the recommendation. if not rec.get("survey", False): rec["sap_points"] = property_phase_impact["sap"] @@ -620,7 +663,9 @@ class Recommendations: impact_summary.append( { "phase": rec["phase"], + "representative": rec["recommendation_id"] in representative_ids, "recommendation_id": rec["recommendation_id"], + "measure_type": rec["measure_type"], **current_phase_values } ) @@ -628,7 +673,9 @@ class Recommendations: return property_recommendations, impact_summary @staticmethod - def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description): + def map_descriptions_to_fuel( + heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types + ): # Handle the case of community schemes if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"): @@ -641,7 +688,7 @@ class Recommendations: } raise NotImplementedError("Handle this case") - mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description] + mapped = descriptions_to_fuel_types[heating_description] heating_fuel = mapped["fuel"] if hotwater_description in [ @@ -661,7 +708,7 @@ class Recommendations: "heating_cop": mapped["cop"], "hotwater_cop": 1 } - mapped_hotwater = 
assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description] + mapped_hotwater = descriptions_to_fuel_types[hotwater_description] return { "heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"], @@ -670,17 +717,24 @@ class Recommendations: @classmethod def calculate_recommendation_tenant_savings( - cls, property_instance, kwh_simulation_predictions, property_recommendations + cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None ): """ This method inserts the kwh savings and the bill savings that the customer will make from the recommendations based on the predictions from the ML model + + It also ensures we base our solar savings and solar carbon savings from the calculations based on + the solar API and size of the array, instead of ML model + :param property_instance: Instance of the Property class, for the home associated to property_id :param kwh_simulation_predictions: dictionary of predictions from the model apis :param property_recommendations: dictionary of recommendations for the property + :param ashp_cop: The coefficient of performance for the air source heat pump. 
:return: """ + ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY + kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][ kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id) ].merge( @@ -739,22 +793,42 @@ class Recommendations: ] ).sort_values(["phase", "recommendation_id"], ascending=True).reset_index(drop=True) + # We need the recommendaion type + rec_id_to_type = { + rec["recommendation_id"]: rec["type"] for recs in property_recommendations for rec in recs + } + rec_id_to_type[STARTING_DUMMY_ID_VALUE] = "starting_dummy" + for i in range(0, len(kwh_impact_table)): - current_phase = kwh_impact_table.loc[i, 'phase'] + current = kwh_impact_table.loc[i] + current_phase = current['phase'] previous_phase_id = (current_phase - 1) if (current_phase > 0) else -9999 previous_phase = kwh_impact_table[kwh_impact_table['phase'] == previous_phase_id] if not previous_phase.empty: for col in ["predictions_heating", "predictions_hotwater"]: + # Check if the recommendation type is ventilation + if rec_id_to_type[current["recommendation_id"]] == "mechanical_ventilation": + # We expect the kwh to increase + if kwh_impact_table.loc[i, col] > previous_phase[col].max(): + continue + if kwh_impact_table.loc[i, col] > previous_phase[col].max(): kwh_impact_table.loc[i, col] = previous_phase[col].max() + descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES + # We will the air source heat pump efficiencies + ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()] + for k in ashp_keys: + descriptions_to_fuel_types[k]["cop"] = ashp_cop + # For heating system recommendations, this could result in a fuel type change so we reflect that fuel_mapping = pd.DataFrame([ { "id": epc["id"], **cls.map_descriptions_to_fuel( - epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"] + epc["mainheat-description"], epc["hotwater-description"], 
epc["main-fuel"], + descriptions_to_fuel_types ) } for epc in property_instance.updated_simulation_epcs ]) @@ -768,7 +842,8 @@ class Recommendations: **cls.map_descriptions_to_fuel( property_instance.data["mainheat-description"], property_instance.data["hotwater-description"], - property_instance.data["main-fuel"] + property_instance.data["main-fuel"], + descriptions_to_fuel_types ) } ] @@ -797,7 +872,7 @@ class Recommendations: for recs in property_recommendations: for rec in recs: if rec["type"] in [ - "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" + "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation" ]: # We cannot score the impact on draught proofing continue @@ -808,6 +883,12 @@ class Recommendations: if rec["type"] == "solar_pv": rec["kwh_savings"] = rec_impact["solar_kwh_savings"].values[0] + + # Calculate carbon savings from this - emissions in kg and convert to tonnes + emissions_kg = rec["kwh_savings"] * assumptions.ELECTRICITY_CARBON_INTENSITY + emissions_tonnes = emissions_kg / 1000 + + rec["co2_equivalent_savings"] = emissions_tonnes rec["energy_cost_savings"] = ( rec_impact["solar_kwh_savings"].values[0] * AnnualBillSavings.ELECTRICITY_PRICE_CAP ) @@ -816,13 +897,18 @@ class Recommendations: heating_kwh_savings = ( previous_phase_impact["predictions_heating"].mean() - rec_impact["predictions_heating"].values[0] ) - heating_cost_savings = ( - previous_phase_impact["heating_cost"].mean() - rec_impact["heating_cost"].values[0] - ) - hotwater_kwh_savings = ( previous_phase_impact["predictions_hotwater"].mean() - rec_impact["predictions_hotwater"].values[0] ) + + # Shouldn't be positive + if rec["type"] == "mechanical_ventilation": + heating_kwh_savings = 0 if heating_kwh_savings > 0 else heating_kwh_savings + hotwater_kwh_savings = 0 if hotwater_kwh_savings > 0 else hotwater_kwh_savings + + heating_cost_savings = ( + previous_phase_impact["heating_cost"].mean() - 
rec_impact["heating_cost"].values[0] + ) hotwater_host = ( previous_phase_impact["hotwater_cost"].mean() - rec_impact["hotwater_cost"].values[0] ) @@ -830,9 +916,8 @@ class Recommendations: total_kwh_savings = heating_kwh_savings + hotwater_kwh_savings energy_cost_savings = heating_cost_savings + hotwater_host - if rec["type"] == "lighting": - # In this case, we should probably just SKIP but check when we have one! - raise Exception("Implement me 3") + if rec["type"] == "low_energy_lighting": + continue rec["kwh_savings"] = total_kwh_savings rec["energy_cost_savings"] = energy_cost_savings diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py index c0fa4eb2..cd7f82c4 100644 --- a/recommendations/RoofRecommendations.py +++ b/recommendations/RoofRecommendations.py @@ -52,6 +52,10 @@ class RoofRecommendations: part for part in materials if part["type"] == "flat_roof_insulation" ] + self.room_roof_insulation_materials = [ + part for part in materials if part["type"] == "room_roof_insulation" + ] + # Extract the insulation thickness from the roof, which is used throughout this method self.insulation_thickness = convert_thickness_to_numeric( self.property.roof["insulation_thickness"], @@ -60,16 +64,16 @@ class RoofRecommendations: ) @classmethod - def get_loft_insulation_sap_limit(cls, roof_energy_eff, extension_count): + def get_loft_insulation_sap_limit(cls, roof_energy_eff, existing_thickness): """ Get the SAP limit for loft insulation :param roof_energy_eff: :return: """ - if extension_count == 0: - # No limit - return None + if str(existing_thickness).isdigit(): + if float(existing_thickness) >= 250: + return 0 if roof_energy_eff in ["Good", "Very Good"]: return 1 @@ -123,7 +127,11 @@ class RoofRecommendations: self.property.roof["insulation_thickness"] in ["average", "above_average"] ) - return full_insulated_room_roof or room_roof_insulated_at_rafters + has_non_invasive_recommendation = any( + x["type"] == 
"room_roof_insulation" for x in self.property.non_invasive_recommendations + ) + + return (full_insulated_room_roof or room_roof_insulated_at_rafters) and not has_non_invasive_recommendation def recommend(self, phase, measures=None, default_u_values=False): @@ -134,6 +142,10 @@ class RoofRecommendations: u_value = self.property.roof["thermal_transmittance"] + # If we have a flat roof but we don't have flat roof as a measure, we exit + if self.property.roof["is_flat"] and "flat_roof_insulation" not in measures: + return + # We check if the roof is already insulated and if so, we exit # Building regulations part L recommend installing at least 270mm of insulation, however generally we @@ -148,6 +160,9 @@ class RoofRecommendations: if self.is_room_roof_insulated_or_unsuitable(measures): return + if self.property.roof["is_thatched"]: + return + # If we have a u-value already, need to implement this if u_value: if u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE: @@ -181,7 +196,8 @@ class RoofRecommendations: # We firstly handle non-intrusive recommendations, which may override the normal roof insulation recommendations if ("loft_insulation" in [x["type"] for x in non_invasive_recommendations]) or ( - self.property.roof["is_pitched"] and "loft_insulation" in measures + self.property.roof["is_pitched"] and "loft_insulation" in measures and + not self.property.roof["is_at_rafters"] ): self.recommend_roof_insulation( u_value=u_value, @@ -282,6 +298,11 @@ class RoofRecommendations: insulation_materials = pd.DataFrame(insulation_materials) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + lowest_selected_u_value = None recommendations = [] for _, insulation_material_group in insulation_materials.groupby("description"): @@ -421,14 +442,15 @@ class RoofRecommendations: "description": self.make_roof_insulation_description(material), "starting_u_value": 
u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", 0), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "roof-description": new_description, "roof-energy-eff": new_efficiency }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) @@ -478,28 +500,22 @@ class RoofRecommendations: :return: """ - # TODO: We temporarilty use costs from SCIS for RIR insulation. The costing was £180/m2 floor - roof_roof_insulation_materials = [ - { - "type": "room_roof_insulation", - "description": "Insulating the ceiling of the roof roof and re-decorate", - "depths": [100], - "depth_unit": "mm", - "r_value_per_mm": 0.038, - "thermal_conductivity": 0.022, - "cost": [180], - } - ] + # We have a list of materials that can be used for room roof insulation + # We will iterate over these materials and recommend them based on the current u-value of the roof + # and the cost of the materials rir_non_invasive_recommendation = next( (x for x in self.property.non_invasive_recommendations if x["type"] == "room_roof_insulation"), {} ) + insulation_materials = pd.DataFrame(self.room_roof_insulation_materials) + # lowest_selected_u_value = None recommendations = [] - for material in roof_roof_insulation_materials: - for depth, cost_per_unit in zip(material["depths"], material["cost"]): - part_u_value = r_value_per_mm_to_u_value(depth, material["r_value_per_mm"]) + for _, material_group in insulation_materials.groupby("description"): + for material in material_group.itertuples(): + + part_u_value = r_value_per_mm_to_u_value(material.depth, material.r_value_per_mm) _, new_u_value = calculate_u_value_uplift(u_value, part_u_value) new_u_value = math.ceil(new_u_value * 100.0) / 100.0 @@ -507,13 +523,11 @@ class RoofRecommendations: # We allow a small tolerance for error so we don't discount the recommendation entirely estimated_cost 
= ( - cost_per_unit * self.property.insulation_floor_area if + material.total_cost * self.property.insulation_floor_area if rir_non_invasive_recommendation.get("cost") is None else rir_non_invasive_recommendation.get("cost") ) - sap_points = rir_non_invasive_recommendation.get("sap_points", None) - # Could also be Roof room(s), ceiling insulated new_descriptin = "Roof room(s), insulated" roof_ending_config = RoofAttributes(new_descriptin).process() @@ -562,7 +576,7 @@ class RoofRecommendations: "description": "Insulate room in roof at rafters and re-decorate", "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": sap_points, + "sap_points": rir_non_invasive_recommendation.get("sap_points", None), "simulation_config": simulation_config, "description_simulation": { "roof-description": new_descriptin, diff --git a/recommendations/SecondaryHeating.py b/recommendations/SecondaryHeating.py index 7c20bcdd..e63951d9 100644 --- a/recommendations/SecondaryHeating.py +++ b/recommendations/SecondaryHeating.py @@ -9,12 +9,6 @@ class SecondaryHeating: system. 
""" - # The list of existing heating systems that are accepted - ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas"] - ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric"] - # These are the heaters where works are required to remove them - FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric"] - def __init__(self, property_instance: Property): self.property = property_instance self.costs = Costs(self.property) @@ -25,18 +19,10 @@ class SecondaryHeating: # Reset self.recommendation = [] - if self.property.main_heating["clean_description"] not in self.ACCEPTED_MAINHEAT_DESCRIPTIONS: - return - - # TODO: We need to clean secondary data - if self.property.data['secondheat-description'] not in self.ACCEPTED_SECONDHEAT_DESCRIPTIONS: - return - - if self.property.data['secondheat-description'] in self.FIXED_HEATER_DESCRIPTIONS: - # We have an associated cost otherwise, there is no cost - n_rooms = self.property.data['number-heated-rooms'] + if self.property.data['number-habitable-rooms'] > self.property.data['number-heated-rooms']: + n_rooms = self.property.data['number-habitable-rooms'] - self.property.data['number-heated-rooms'] else: - n_rooms = 0 + n_rooms = self.property.data["number-heated-rooms"] costs = self.costs.heater_removal(n_rooms=n_rooms) diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py index 66c1d0c3..ee07ff28 100644 --- a/recommendations/SolarPvRecommendations.py +++ b/recommendations/SolarPvRecommendations.py @@ -1,24 +1,39 @@ import numpy as np import pandas as pd +import backend.app.assumptions as assumptions from recommendations.Costs import Costs from recommendations.recommendation_utils import override_costs, estimate_pitched_roof_area class SolarPvRecommendations: - # Solar panel specs based on Eurener 400s solar panels - # https://midsummerwholesale.co.uk/buy/eurener/eurener-400w-mepv-zebra-ab-half-cut-mono - # Approximate area of the solar panels - SOLAR_PANEL_AREA = 1.79 
- # Wattage per panel - this is based on the average wattage of a solar panel being between 250w and 420w - # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group - SOLAR_PANEL_WATTAGE = 400 - + # For domestic properties, we don't recommend a solar PV system with wattage outside of these + # bounds MAX_SYSTEM_WATTAGE = 6000 MIN_SYSTEM_WATTAGE = 1000 + # the maximum area of root we allow to be covered in solar panels for our recommendations. MAX_ROOF_AREA_PERCENTAGE = 0.7 + SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1 + + BACKUP_PANEL_PERFORMANCE = pd.DataFrame( + [ + { + "n_panels": 4, + "array_wattage": 1600, + "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 1600, + "panneled_roof_area": 4 * assumptions.RDSAP_AREA_PER_PANEL + }, + { + "n_panels": 8, + "array_warrage": 3200, + "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 3200, + "panneled_roof_area": 8 * assumptions.RDSAP_AREA_PER_PANEL + }, + ] + ) + def __init__(self, property_instance): """ :param property_instance: Instance of the Property class, for the home associated to property_id @@ -42,46 +57,6 @@ class SolarPvRecommendations: return trimmed_list - def mds_recommend(self, phase=None, solar_pv_percentage=0.5): - # For specific usage within the mds report - - solar_pv_roof_area = self.property.get_solar_pv_roof_area(solar_pv_percentage) - - number_solar_panels = np.floor(solar_pv_roof_area / self.SOLAR_PANEL_AREA) - solar_panel_wattage = number_solar_panels * self.SOLAR_PANEL_WATTAGE - - solar_panel_wattage = np.clip( - a=solar_panel_wattage, a_min=self.MIN_SYSTEM_WATTAGE, a_max=self.MAX_SYSTEM_WATTAGE - ) - - # We now have a property which is potentially suitable for solar PV - roof_coverage_percent = round(solar_pv_percentage * 100) - # Given the wattage, we estimate the cost of the solar PV system. 
This is based on the MCS database - # of solar PV installations - cost_result = self.costs.solar_pv(wattage=solar_panel_wattage, has_battery=False) - kw = np.floor(solar_panel_wattage / 100) / 10 - - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p" - f"anel system on {round(roof_coverage_percent)}% the roof.") - - return [ - { - "phase": phase, - "parts": [], - "type": "solar_pv", - "description": description, - "starting_u_value": None, - "new_u_value": None, - "sap_points": None, - "already_installed": False, - **cost_result, - # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we scale - # back up here - "photo_supply": roof_coverage_percent, - "has_battery": False - } - ] - def recommend_building_analysis(self, phase): """ This recommendation approach handles the case of producing solar PV recommendations at the building level, @@ -103,13 +78,22 @@ class SolarPvRecommendations: for rank, recommendation_config in best_configurations.iterrows(): # If we dont have the panneled_roof_area in the recommendation_config we calculate it if recommendation_config.get("panneled_roof_area", None): - roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / total_roof_area * 100) + # We spread the coverage across the individual units + roof_coverage_percent = round( + ((recommendation_config["panneled_roof_area"] / total_roof_area) * 100) / n_units + ) else: raise Exception("IMPLEMENT ME") + + n_floors = ( + self.property.number_of_storeys["number_of_storeys"] if + self.property.number_of_storeys["number_of_storeys"] is not None else 3 + ) + total_cost = self.costs.solar_pv( array_cost=recommendation_config.get("cost", None), n_panels=recommendation_config["n_panels"], - n_floors=self.property.number_of_storeys["number_of_storeys"], + n_floors=n_floors, needs_inverter=True, )["total"] / n_units @@ -203,6 +187,20 @@ class SolarPvRecommendations: roof_coverage_percent = 
round(recommendation_config["panneled_roof_area"] / roof_area * 100) # We round up to the nearest 5 roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5 + + # Typically, we've observed that every 5% of additional roof coverage will result in at least + # an additional 1 SAP points (though often 2 points) Given this, we can add a reasonable minimum + # for the number of SAP points we might expect. We've observed that for some cases where properties + # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict + # the number of SAP points. This appears to be due to a relatively small number of properties + # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a + # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels. + # Because panels are the final recommendation, they are often the measure that takes the home + # into the medium to high EPC A ranges and so because of a lack of training data, this means that + # we might sometime under-predict. This minimum is intended to try and reduce the negative impact + # of this. This minimum is used in Recommendations.calculate_recommendation_impact + minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE + for has_battery in [False, True]: cost_result = self.costs.solar_pv( has_battery=has_battery, @@ -212,11 +210,14 @@ class SolarPvRecommendations: ) kw = np.floor(recommendation_config["array_wattage"] / 100) / 10 if has_battery: - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) panel system on " - f"{round(roof_coverage_percent)}% the roof, with a battery storage system.") + description = ( + f"Install a {kw} kilowatt-peak (kWp) solar panel system, with a battery." 
+ ) else: - description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p" - f"anel system on {round(roof_coverage_percent)}% the roof.") + description = f"Install a {kw} kilowatt-peak (kWp) solar panel system." + + if self.property.in_conservation_area: + description += " Property is in a consevation area - please check with local planning authority." already_installed = "solar_pv" in self.property.already_installed if already_installed: @@ -231,7 +232,7 @@ class SolarPvRecommendations: "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": minimum_sap_points, "already_installed": already_installed, **cost_result, # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we diff --git a/recommendations/VentilationRecommendations.py b/recommendations/VentilationRecommendations.py index 9738b898..a82e4df5 100644 --- a/recommendations/VentilationRecommendations.py +++ b/recommendations/VentilationRecommendations.py @@ -29,7 +29,7 @@ class VentilationRecommendations(Definitions): def identify_ventilation(self): self.has_ventilaion = self.property.data["mechanical-ventilation"] in self.VENTILATION_DESCRIPTIONS - def recommend(self): + def recommend(self, phase): """ If there is no ventilation, we recommend installing ventilation @@ -63,7 +63,7 @@ class VentilationRecommendations(Definitions): # We recommend installing two mechanical ventilation systems self.recommendation = [ { - "phase": None, + "phase": phase, "parts": part, "type": part[0]["type"], "measure_type": "mechanical_ventilation", @@ -79,7 +79,13 @@ class VentilationRecommendations(Definitions): "total": estimated_cost, # We use a very simple and rough estimate of 4 hours per unit "labour_hours": labour_hours, - "labour_days": labour_days # Assume 8 hour day + "labour_days": labour_days, # Assume 8 hour day + "simulation_config": { + "mechanical_ventilation_ending": "mechanical, extract only", + }, + 
"description_simulation": { + "mechanical-ventilation": "mechanical, extract only" + } } ] diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py index c7917911..92147fb8 100644 --- a/recommendations/WallRecommendations.py +++ b/recommendations/WallRecommendations.py @@ -385,6 +385,11 @@ class WallRecommendations(Definitions): if insulation_thickness == "below average": cavity_width = cavity_width * (1 - PARTIALLY_FILLED_PERCENTAGE_ASSUMPTION) + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} + ) + # Test the different fill options lowest_selected_u_value = None recommendations = [] @@ -475,14 +480,15 @@ class WallRecommendations(Definitions): "description": description, "starting_u_value": u_value, "new_u_value": new_u_value, - "sap_points": None, + "sap_points": non_invasive_recommendations.get("sap_points", None), "already_installed": already_installed, "simulation_config": simulation_config, "description_simulation": { "walls-description": "Cavity wall, filled cavity", "walls-energy-eff": "Good" }, - **cost_result + **cost_result, + "survey": non_invasive_recommendations.get("survey", False) } ) @@ -540,15 +546,10 @@ class WallRecommendations(Definitions): lowest_selected_u_value = None recommendations = [] - - iwi_non_invasive_recommendations = next( - (r for r in self.property.non_invasive_recommendations if r["type"] == "internal_wall_insulation"), {} + non_invasive_recommendations = next( + (r for r in self.property.non_invasive_recommendations if + r["type"] == insulation_materials["type"].values[0]), {} ) - ewi_non_invasive_recommendations = next( - (r for r in self.property.non_invasive_recommendations if r["type"] == "external_wall_insulation"), {} - ) - if ewi_non_invasive_recommendations: - raise NotImplementedError("Implement ewi non-invasive recommendations") for _, insulation_material_group in 
insulation_materials.groupby("description"): @@ -590,31 +591,25 @@ class WallRecommendations(Definitions): if already_installed: cost_result = override_costs(cost_result) + if non_invasive_recommendations.get("cost") is not None: + raise NotImplementedError( + "Not handled passing costs from non-invasive recommendations for iwi" + ) + if material["type"] == "internal_wall_insulation": - - if iwi_non_invasive_recommendations.get("cost") is not None: - raise NotImplementedError( - "Not handled passing costs from non-invasive recommendations for iwi" - ) - - sap_points = iwi_non_invasive_recommendations.get("sap_points", None) - survey = iwi_non_invasive_recommendations.get("survey", False) - new_description = self.get_internal_external_wall_description( self.INTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value ) - elif material["type"] == "external_wall_insulation": - - sap_points = ewi_non_invasive_recommendations.get("sap_points", None) - survey = ewi_non_invasive_recommendations.get("survey", False) - new_description = self.get_internal_external_wall_description( self.EXTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value ) else: raise ValueError("Invalid material type") + sap_points = non_invasive_recommendations.get("sap_points", None) + survey = non_invasive_recommendations.get("survey", False) + wall_ending_config = WallAttributes(new_description).process() walls_simulation_config = check_simulation_difference( diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py index 1f755369..46e56c93 100644 --- a/recommendations/WindowsRecommendations.py +++ b/recommendations/WindowsRecommendations.py @@ -215,21 +215,29 @@ class WindowsRecommendations: "glazed-type": glazed_type_ending, } + measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing" + + non_invasive_recommendation = next( + (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]), + {} + ) + 
self.recommendation = [ { "phase": phase, "parts": [], "type": "windows_glazing", - "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing", + "measure_type": measure_type, "description": description, "starting_u_value": None, "new_u_value": None, - "sap_points": None, + "sap_points": non_invasive_recommendation.get("sap_points", None), "already_installed": already_installed, **cost_result, "is_secondary_glazing": is_secondary_glazing, "description_simulation": description_simulation, "simulation_config": simulation_config, + "survey": non_invasive_recommendation.get("survey", None), } ] diff --git a/recommendations/county_to_region.py b/recommendations/county_to_region.py index f7d5193f..13c1cdaa 100644 --- a/recommendations/county_to_region.py +++ b/recommendations/county_to_region.py @@ -111,8 +111,11 @@ county_to_region_map = { 'Windsor and Maidenhead': 'South East England', 'Woking': 'South East England', 'Wokingham': 'South East England', 'Worthing': 'South East England', 'Wycombe': 'South East England', 'Bath and North East Somerset': 'South West England', 'Bournemouth': 'South West England', - 'Bristol': 'South West England', 'Cheltenham': 'South West England', 'Christchurch': 'South West England', - 'City of Bristol': 'South West England', 'Cornwall': 'South West England', 'Cotswold': 'South West England', + 'Bristol': 'South West England', + 'Cheltenham': 'South West England', 'Christchurch': 'South West England', + 'City of Bristol': 'South West England', + 'Bristol, City of': 'South West England', + 'Cornwall': 'South West England', 'Cotswold': 'South West England', 'Devon': 'South West England', 'Dorset': 'South West England', 'East Devon': 'South West England', 'East Dorset': 'South West England', 'Exeter': 'South West England', 'Forest of Dean': 'South West England', 'Gloucester': 'South West England', 'Gloucestershire': 'South West England', @@ -132,7 +135,10 @@ county_to_region_map = { 'Merthyr Tydfil': 'Wales', 
'Monmouthshire': 'Wales', 'Mountain Ash': 'Wales', 'Neath Port Talbot': 'Wales', 'Newport': 'Wales', 'Pembrokeshire': 'Wales', 'Penarth': 'Wales', 'Pentre': 'Wales', 'Pontyclun': 'Wales', 'Pontypridd': 'Wales', 'Porth': 'Wales', 'Porthcawl': 'Wales', 'Powys': 'Wales', 'Rhondda Cynon Taff': 'Wales', - 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales', 'The Vale of Glamorgan': 'Wales', 'Tonypandy': 'Wales', + 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales', + 'The Vale of Glamorgan': 'Wales', + 'Vale of Glamorgan': 'Wales', + 'Tonypandy': 'Wales', 'Torfaen': 'Wales', 'Treharris': 'Wales', 'Treorchy': 'Wales', 'Wrexham': 'Wales', 'Birmingham': 'West Midlands', 'Bromsgrove': 'West Midlands', 'Cannock Chase': 'West Midlands', 'Coventry': 'West Midlands', 'Dudley': 'West Midlands', 'East Staffordshire': 'West Midlands', 'Herefordshire': 'West Midlands', diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py index c1123e3d..05b9ec42 100644 --- a/recommendations/optimiser/optimiser_functions.py +++ b/recommendations/optimiser/optimiser_functions.py @@ -1,10 +1,14 @@ -def prepare_input_measures(property_recommendations, goal): +import backend.app.assumptions as assumptions + + +def prepare_input_measures(property_recommendations, goal, needs_ventilation): """ Basic function to convert recommendations_to_upload to a format that is suitable for the optimiser - large :param property_recommendations: object containing the recommendations, created in the plan trigger api :param goal: goal to be optimised for, should be one of the keys in gain_map. E.g. 
if the gain is SAP points, the goal should reflect that desired gain + :param needs_ventilation: boolean to indicate if the property needs ventilation :return: Nested list of input measures """ @@ -16,23 +20,58 @@ def prepare_input_measures(property_recommendations, goal): if not goal_key: raise NotImplementedError("Not implemented this gain type - investigate me") + # We ony ever have one ventilation measure with now + ventilation_recommendation = next( + (measure[0] for measure in property_recommendations if measure[0]["type"] == "mechanical_ventilation"), + {} + ) + input_measures = [] for recs in property_recommendations: + if needs_ventilation and recs[0]["type"] == "mechanical_ventilation": + # If we house needs ventilation, ventilation will be packaged with the fabric measure so + # we don't need to optimise it independently + continue + if recs[0]["type"] == "solar_pv": # if the recommendation is a solar recommendation with a battery, we exclude it from the optimisation. recs = [r for r in recs if ~r["has_battery"]] - input_measures.append( - [ + recs_to_append = [rec for rec in recs if rec["energy_cost_savings"] >= 0] + if not recs_to_append: + continue + + to_append = [] + for rec in recs: + # We bundle the impact of ventilation with the measure + total = ( + rec["total"] + ventilation_recommendation["total"] + if rec["type"] in assumptions.measures_needing_ventilation + else rec["total"] + ) + gain = ( + rec[goal_key] + ventilation_recommendation[goal_key] + if rec["type"] in assumptions.measures_needing_ventilation + else rec[goal_key] + ) + + rec_type = ( + "+".join( + [rec["type"], ventilation_recommendation["type"]] + ) if rec["type"] in assumptions.measures_needing_ventilation + else rec["type"] + ) + + to_append.append( { "id": rec["recommendation_id"], - "cost": rec["total"], - "gain": rec[goal_key], - "type": rec["type"] + "cost": total, + "gain": gain, + "type": rec_type } - for rec in recs - ] - ) + ) + + input_measures.append(to_append) return 
input_measures diff --git a/recommendations/rdsap_tables.py b/recommendations/rdsap_tables.py index 16c7d26e..e56faf7c 100644 --- a/recommendations/rdsap_tables.py +++ b/recommendations/rdsap_tables.py @@ -257,7 +257,7 @@ epc_wall_description_map = { "Timber frame, as built, partial insulation": "Timber frame as built", "Timber frame, as built, no insulation": "Timber frame as built", "Timber frame, with external insulation": "Timber frame with internal insulation", - + "Timber frame, with internal insulation": "Timber frame with internal insulation", ############################ # Sandstone/limestones wall mappings ############################ diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py index 00da6107..602684cf 100644 --- a/recommendations/recommendation_utils.py +++ b/recommendations/recommendation_utils.py @@ -205,7 +205,7 @@ def get_wall_u_value( mapped_value = wall_uvalues_df[ wall_uvalues_df["Wall_type"] == mapped_description - ][age_band].values[0] + ][age_band].values[0] if pd.isnull(mapped_value) and "Park home" in mapped_description: # We don't know enough in this case so we default to 0 @@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type): Using the property type, we estimate the number of floors in the property """ + if property_type is None: + return None + if property_type == "House": number_of_floors = 2 elif property_type in ["Flat", "Bungalow"]: @@ -560,7 +563,7 @@ def get_floor_u_value( insulation_lookup = s11[ s11["Age_band"].str.contains(age_band) & s11["Floor_construction"] == floor_type - ] + ] if insulation_lookup.empty: insulation_thickness = 0 else: diff --git a/survey_report/app.py b/survey_report/app.py new file mode 100644 index 00000000..f6eddb8d --- /dev/null +++ b/survey_report/app.py @@ -0,0 +1,270 @@ +import os +import requests +import PyPDF2 +from string import Template + +import pandas as pd + +from survey_report.extraction.detect_report_type import detect_report_type 
def generate_html_report(template_path, output_path, data):
    """
    Reads an HTML template file, injects dynamic values, and writes the final HTML report.

    Args:
    - template_path (str): Path to the HTML template file (``string.Template`` syntax, e.g. ``${address}``).
    - output_path (str): Path to save the generated HTML file.
    - data (dict): Dictionary mapping placeholder names to the values to inject.
    """
    with open(template_path, "r", encoding="utf-8") as template_file:
        html_template = Template(template_file.read())

    # safe_substitute leaves unknown placeholders untouched instead of raising KeyError
    final_html = html_template.safe_substitute(data)

    with open(output_path, "w", encoding="utf-8") as output_file:
        output_file.write(final_html)

    print(f"HTML report generated successfully: {output_path}")


def stringify_number(num: int, rounding: bool = True) -> str:
    """
    Formats a number for display in the report.

    Values below 100,000 are rendered with thousands separators (rounded up to the next 100 when
    ``rounding`` is set); larger values are rendered in thousands with a ``k`` suffix (rounded up
    to the next 1,000 when ``rounding`` is set).

    :param num: the number to format
    :param rounding: whether to round up before formatting
    :return: the formatted string, e.g. "1,300" or "250k"
    """
    if num < 100000:  # 5 figures or fewer
        rounded_num = ((num + 99) // 100) * 100 if rounding else num
        return f"{rounded_num:,}"

    # More than 5 figures
    rounded_num = ((num + 999) // 1000) * 1000 if rounding else num
    return f"{rounded_num // 1000}k"


class PlacidApi:
    """
    Small client for the PDF endpoints of the Placid REST API.
    """

    # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
    ERROR_CODES = {
        400: "Bad request",
        401: "Unauthorized",
        404: "Template Not found",
        422: "Validation error",
        429: "Rate limit exceeded",
        500: "Internal server error",
    }

    # Seconds to wait for any single HTTP request before giving up
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        self.api_key = api_key

        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    def _parse_response(self, response):
        """
        Returns the decoded JSON body of a Placid response, raising on HTTP error statuses.

        :raises requests.HTTPError: if the response carries a 4xx/5xx status code
        """
        if response.status_code >= 400:
            reason = self.ERROR_CODES.get(response.status_code, "Unexpected error")
            raise requests.HTTPError(
                f"Placid API request failed ({response.status_code}): {reason}",
                response=response,
            )
        return response.json()

    def create_pdf(
            self,
            template_uuid: str,
            current_epc_rating: str,
            current_epc_rating_colour: str,
            post_retrofit_epc_rating: str,
            post_retrofit_epc_rating_colour: str,
    ):
        """
        Queues the generation of a single-page PDF from a Placid template.

        :param template_uuid: identifier of the Placid template to render
        :param current_epc_rating: text for the ``current_epc_rating`` layer
        :param current_epc_rating_colour: text colour for the ``current_epc_rating`` layer
        :param post_retrofit_epc_rating: text for the ``post_retrofit_epc_rating`` layer
        :param post_retrofit_epc_rating_colour: text colour for the ``post_retrofit_epc_rating`` layer
        :return: the decoded JSON response body (contains the ``id`` used by ``get_pdf``)
        :raises requests.HTTPError: if the API answers with an error status
        """
        url = "https://api.placid.app/api/rest/pdfs"

        body = {
            "webhook_success": None,
            "passthrough": None,
            "pages": [
                {
                    "template_uuid": template_uuid,
                    "layers": {
                        "current_epc_rating": {
                            "text": current_epc_rating,
                            "text_color": current_epc_rating_colour,
                        },
                        "post_retrofit_epc_rating": {
                            "text": post_retrofit_epc_rating,
                            "text_color": post_retrofit_epc_rating_colour,
                        }
                    },
                },
            ]
        }

        response = requests.post(
            url,
            headers=self.headers,
            json=body,
            timeout=self.REQUEST_TIMEOUT,
        )

        return self._parse_response(response)

    def get_pdf(
            self,
            pdf_id: str,
            output_path: str = "survey_report/example_data/output.pdf",
            poll_interval: float = 5,
            timeout: float = 300,
    ):
        """
        Polls the API until the PDF is ready, then downloads it to ``output_path``.

        A freshly created PDF is returned as queued with ``pdf_url`` set to None, so a single
        request is not enough; we keep asking until a download URL is available.

        :param pdf_id: the ``id`` returned by ``create_pdf``
        :param output_path: where the downloaded PDF is written
        :param poll_interval: seconds to wait between polls
        :param timeout: maximum number of seconds to wait for the PDF to become available
        :return: ``output_path``
        :raises requests.HTTPError: if a request answers with an error status
        :raises RuntimeError: if Placid reports that rendering failed
        :raises TimeoutError: if the PDF is not ready within ``timeout`` seconds
        """
        import time

        url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}"
        deadline = time.monotonic() + timeout

        while True:
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response_body = self._parse_response(response)

            pdf_url = response_body.get("pdf_url")
            if pdf_url:
                break

            # NOTE(review): "error" is assumed to be Placid's terminal failure status - confirm against the docs
            if response_body.get("status") == "error":
                raise RuntimeError(f"Placid failed to render PDF {pdf_id}")

            if time.monotonic() >= deadline:
                raise TimeoutError(f"PDF {pdf_id} was not ready after {timeout} seconds")

            time.sleep(poll_interval)

        # Download the PDF from the returned url
        pdf_download = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        pdf_download.raise_for_status()
        with open(output_path, "wb") as f:
            f.write(pdf_download.content)

        return output_path
SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2/3 WILLIS " + "ROAD FLAT 2 PRE EPR PDF.pdf", + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 2/3 WILLIS ROAD FLAT 2 POST EPR SITE NOTES.pdf" + }, + { + "site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 " + "WILLIS ROAD FLAT 3 PRE EPR SITE NOTES.pdf", + "epr": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3/3 WILLIS " + "ROAD FLAT 3 PRE EPR PDF.pdf", + "scenario_site_notes": "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data" + "/Flat 3/3 WILLIS ROAD FLAT 3 POST EPR SITE NOTES.pdf" + }, + ] + + data = [] + for data_config in folders: + + file_mapping = {} + for filename, filepath in data_config.items(): + with (open(filepath, "rb") as f): + pdf = PyPDF2.PdfReader(f) + first_page = pdf.pages[0].extract_text() + text = "" + for page in pdf.pages: + text += page.extract_text() + + # Check the report type + report_type = detect_report_type(first_page) + if report_type is not None: + file_mapping[filename] = text + + # This is only set up to work with quido site notes so we must have it + site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"]) + site_notes = site_notes_extractor.extract_all() + + # We also must have an EPR + epr_extractor = EPRExtractor(file_mapping["epr"]) + epr = epr_extractor.extract_all() + + # Valuation simulation + scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"]) + scenario_site_notes = scenario_site_notes_extractor.extract_all() + + from backend.ml_models.Valuation import PropertyValuation + valuation_uplift = PropertyValuation.estimate_valuation_improvement( + current_value=current_property_value, + current_epc=site_notes["Current EPC Band"], + target_epc=scenario_site_notes["Current EPC Band"], + ) + # 
TODO - should convert this, when it's more than 5 figures and we should certainly stringify this + + valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value) + + # Prepare the data for output + bill_savings = round( + site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)'] + ) + + carbon_savings = round( + site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"], + 2 + ) + + payback_period = None + if payback_period is None: + raise NotImplementedError("Implement me") + + # We extract the measures from the site notes + + report_data = { + "current_epc_rating": site_notes["Current EPC Band"], + "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]], + "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"], + "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]], + "bill_savings": stringify_number(bill_savings), + "valuation_improvement": stringify_number(valuation_difference), + "carbon_savings": carbon_savings, + + } + + # We now produce the combined data sheet which is the starting figure: + # data_sheet = {**epr, **site_notes} + # del data_sheet['Building Dimensions'] + # # We unnest the Total Building Dimensions + # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"] + # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"] + # del data_sheet["Total Building Dimensions"] + + create_pdf_response = placid_api.create_pdf( + template_uuid=TEMPLATE_UUID, **report_data + ) + # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None} + # Download locally + placid_api.get_pdf(create_pdf_response["id"]) + + data = pd.DataFrame(data) + + # Generate the HTML report + # Placeholder locations + template_path = 
class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy costs, building dimensions and wall details
    from the text of a Quidos site notes (EPC summary) report.
    """

    def __init__(self, pdf_text):
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self):
        """
        Extracts the current and potential SAP rating (band letter and score) from the report.

        :raises ValueError: if the SAP rating line cannot be found
        """
        found = re.search(
            r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)",
            self.text,
        )

        if not found:
            raise ValueError("No SAP rating found in the report")

        current_band, current_score, potential_band, potential_score = found.groups()
        self.data.update({
            "Current EPC Band": current_band,
            "Current SAP Rating": int(current_score),
            "Potential EPC Band": potential_band,
            "Potential SAP Rating": int(potential_score),
        })

    def extract_carbon_emissions(self):
        """
        Extracts the current annual carbon emissions (TCO2).

        :raises ValueError: if the emissions line cannot be found
        """
        found = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)

        if not found:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(found.group(1)),
        })

    def extract_building_dimensions(self):
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions, and stores floor area and heat loss area totals.

        :raises ValueError: if the dimensions table, or any building part inside it, cannot be found
        """

        # Locate the Dimensions section: everything between the table header and "5.0 Conservatory"
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
        )

        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        building_parts = []
        for match in building_part_pattern.finditer(dimensions_text):
            room_height = float(match.group(3))
            loss_perimeter = float(match.group(4))
            building_parts.append({
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": room_height,
                "Loss Perimeter (m)": loss_perimeter,
                "Party Wall Length (m)": float(match.group(5)),
                # We calculate the heat loss area
                "Heat Loss Area (m2)": loss_perimeter * room_height,
            })

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # We calculate some totals
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self):
        """
        Extracts the estimated annual energy costs (£) from the report.

        :raises ValueError: if the energy cost line cannot be found
        """
        found = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)

        if not found:
            raise ValueError("No bills estimate found in the report")

        # Thousands separators are stripped before converting
        self.data["Estimated Annual Energy Cost (£)"] = float(found.group(1).replace(",", ""))

    def extract_all(self):
        """
        Runs the SAP rating, carbon emissions, bills and dimensions extraction and returns a
        dictionary with the extracted data. Wall details are not included; call ``extract_walls``
        separately for those.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # TODO: extract specific measures - primary wall, secondary wall, roof, floor, heating system,
        #  hot water system, windows, doors, lighting, ventilation, solar

        return self.data

    def extract_walls(self):
        """
        Extracts wall type, insulation, and thickness for each building part within the 7.0 Walls
        section of the summary PDF text.

        :return: list with one dictionary per building part. The "Alternative ..." keys are always
            None for now (see the TODO below).
        :raises ValueError: if the walls section cannot be found
        """

        # Isolate the 7.0 Walls section
        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", self.text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        wall_section = wall_section_match.group(1)

        # Define patterns to match walls for each building part. Every detail line is optional so
        # a part with missing rows still produces an entry.
        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
            re.MULTILINE
        )

        # TODO: We aren't effectively picking up alternative walls ("Alternative Wall Sheltered" rows);
        #  until that is handled the "Alternative ..." fields are left as None.

        wall_data = []
        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            # "Main Property Alternative" is reported against the main property
            if "Main Property" in building_part:
                building_part = "Main Property"

            thickness = match.group("thickness")

            wall_data.append({
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                # The group only ever matches digits, so a truthy value is always convertible
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            })

        return wall_data


class EPRExtractor:
    """
    Extracts space heating, water heating, and address from an Energy Performance Report (EPR).
    """

    def __init__(self, pdf_text):
        """
        Initializes the EPRExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        self.data = {}

    def extract_heating_consumption(self):
        """
        Extracts space heating and water heating values (KWH) from the report.

        :raises ValueError: if the heating figures cannot be found
        """
        found = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            re.DOTALL
        )

        if not found:
            raise ValueError("No heating data found in the report")

        self.data.update({
            "Space Heating (KWH)": int(found.group(1).replace(",", "")),
            "Water Heating (KWH)": int(found.group(2).replace(",", ""))
        })

    def extract_address(self):
        """
        Extracts the address (the text between "Address" and the "Town" line) from the report.

        :raises ValueError: if the address cannot be found
        """
        found = re.search(
            r"Address\s*(.*?)\nTown\s*(.*?)\n",
            self.text,
            re.DOTALL
        )

        if not found:
            raise ValueError("No address found in the report")

        self.data["Address"] = found.group(1).strip()

    def extract_all(self):
        """
        Runs all extraction methods and returns a dictionary with extracted data.
        """
        self.extract_address()
        self.extract_heating_consumption()
        return self.data
+ +
+
+

Domna Energy Report

+

${address}

+
+ +
+ + +
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+ +
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+ +
class OsmosisConditionReportParser:
    """
    Reads an Osmosis condition report PDF and extracts selected fields from its text.
    """

    def __init__(self, filekey, bucket_name=None):
        """
        :param filekey: local path of the PDF (or the S3 key once S3 reading is implemented)
        :param bucket_name: S3 bucket holding the file; reading from S3 is not implemented yet
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None

        self._read_file()

    def _read_file(self):
        """
        Reads the PDF file and stores the concatenated text of all of its pages in ``self.pdf_text``.

        Raises:
            NotImplementedError: If a bucket name was given (reading from S3 is not implemented).
            ValueError: If the file cannot be found or read.
        """

        if self.bucket_name:
            # Raised outside the try block so it is not re-wrapped as a misleading ValueError
            raise NotImplementedError("Reading condition reports from S3 is not implemented")

        try:
            with fitz.open(self.filekey) as pdf:
                self.pdf_text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            raise ValueError(f"An error occurred while reading the PDF: {e}") from e

    def _search_field(self, pattern, field_name):
        """
        Returns the first capture group of ``pattern`` in the report text.

        Raises:
            ValueError: If the pattern does not match the report text.
        """
        match = re.search(pattern, self.pdf_text or "")
        if match is None:
            raise ValueError(f"Could not find '{field_name}' in the condition report")
        return match.group(1)

    def extract(self):
        """
        Extracts the number of bedrooms and the risk assessment pathway from the report text.

        Returns:
            dict: ``{"No. of Bedrooms": int, "Risk Assessment Pathway": str}``

        Raises:
            ValueError: If either field cannot be found in the report text.
        """
        return {
            "No. of Bedrooms": int(
                self._search_field(r"No\. of Bedrooms \(Total\)\s*(\d+)", "No. of Bedrooms")
            ),
            "Risk Assessment Pathway": self._search_field(
                r"Risk\s*Assessment\s*Pathway\s*([A-Z])", "Risk Assessment Pathway"
            )
        }
+ """ + return "Retrofit_Project_Handover" in text or "Retrofit Project Handover" in text + + +def is_core_logic_pas_assessment_report(text): + """ + Determines if the provided text indicates that the PDF is a PAS Assessment Report. + """ + return text.startswith("Generated Using CoreLogic UK PAS Assessment") + + +def detect_pdf_report_type(pdf_path): + """ + Detects the type of report based on content or filename. + :param pdf_path: String path to the PDF file + :return: String type of the report ("epr", "summary", or None) + """ + # Attempt to read the first page of the PDF to determine type + with open(pdf_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + first_page_text = reader.pages[0].extract_text() if reader.pages else "" + + if first_page_text == "": + # Convert PDF pages to images + logger.info("Extracting text from PDF images..., this may take a moment.") + pages = convert_from_path(pdf_path, dpi=300) + if pages: + first_page_text = image_to_string(pages[0]) + + if is_elmhurst_energy_report(first_page_text): + return "elmhurst epr" + elif is_elmhurst_summary_report(first_page_text): + return "elmhurst summary report" + elif is_osmosis_condition_report(first_page_text): + return "osmosis condition report" + elif is_elmhurst_evidence_report(first_page_text): + return "elmhurst evidence report" + elif is_pulse_air_permeability(first_page_text): + return "pulse air permeability" + elif is_elmhurst_project_handover(first_page_text): + return "elmhurst project handover" + elif is_core_logic_pas_assessment_report(first_page_text): + return "core logic pas assessment report" + + return None + + +def detect_xml_report_type(xml_path): + """ + Detects the type of XML report based on content or filename. 
def is_pdf(filename):
    """
    Determines if the provided filename is a PDF file.

    :param filename: String name (or path) of the file
    :return: True if the name ends with ".pdf", ignoring case
    """
    # Lower-casing first means names such as "REPORT.PDF" are recognised too
    return filename.lower().endswith(".pdf")


def is_xml(filename):
    """
    Determines if the provided filename is an XML file.

    :param filename: String name (or path) of the file
    :return: True if the name ends with ".xml", ignoring case
    """
    # Lower-casing first means names such as "REPORT.XML" are recognised too
    return filename.lower().endswith(".xml")
+ """ + windows_text = windows_text.replace("\n", "") + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the provided text. 
+ """ + data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party " + r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + floor_data = match.group(2) + room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name) + if room_in_roof_match: + floor_area = float(room_in_roof_match.group(1)) + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, + "Perimeter (m)": None, + "Party Wall Length (m)": None + }) + else: + cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip() + + floor_pattern = re.compile( + r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + data.append({ + "Building Part": cleaned_part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + }) + + return data + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof details for each building part in the provided text. 
+ """ + roof_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details) + roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details) + + roof_data.append({ + "Building Part": cleaned_part_name, + "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None, + "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None, + "Roof Insulation Thickness": roof_insulation_thickness_match.group( + 1).strip() if roof_insulation_thickness_match else None, + }) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall details for each building part in the provided text. 
+ """ + wall_data = [] + building_part_pattern = re.compile( + r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)", + re.DOTALL + ) + for match in building_part_pattern.finditer(text): + part_name = match.group(1).strip() + cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip() + part_details = match.group(2) + wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details) + wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details) + wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details) + wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details) + + wall_data.append({ + "Building Part": cleaned_part_name, + "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None, + "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None, + "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None, + "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None, + }) + + return wall_data + + @staticmethod + def extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "Conservatory" and "Doors". + + Args: + text (str): The full text of the EPR PDF. 
+ + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + conservatory_match = re.search(r"Conservatory\s*(.*?)\s*Doors", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Conservatory Present:\s*(Yes|No)", conservatory_text) + + if not present_match or present_match.group(1).strip() == "No": + logger.info("Conservatory not present.") + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # Extract conservatory details + separated_match = re.search(r"Conservatory Separated:\s*(Yes|No)", conservatory_text) + floor_area_match = re.search(r"Conservatory Floor Area:\s*([\d.]+)", conservatory_text) + double_glazed_match = re.search(r"Conservatory Double Glazed:\s*(Yes|No)", conservatory_text) + glazed_perimeter_match = re.search(r"Conservatory Glazed Perimeter:\s*([\d.]+)", conservatory_text) + height_match = re.search(r"Heated Conservatory Height:\s*(.*?)(?=\n|$)", conservatory_text) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if 
def extract_primary_heating(self, text):
    """
    Extracts the details of the primary heating system ("Main Heating 1") from the EPR text.

    :param text: String containing the full text of the EPR PDF
    :return: Dictionary of heating details as produced by _extract_heating_details
    :raises ValueError: if no "Main Heating 1" section can be located in the text
    """
    # The section normally runs up to the "Main Heating 2" heading; when that heading is not
    # present, fall back to the section running up to "Secondary Heating"
    primary_heating_section = (
        re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
        or re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
    )
    if primary_heating_section is None:
        # Previously this fell through to None.group(1) and surfaced as an AttributeError
        raise ValueError("Failed to extract primary heating data.")

    return self._extract_heating_details(primary_heating_section.group(1))
Heat"] = 0 + + else: + secondary_text = secondary_heating_section.group(1) + output.update( + **self._extract_heating_details(secondary_text) + ) + + output["Heating Code"] = ( + re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip() + if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text) + else "" + ) + + return output + + def extract(self): + """ + Extracts all relevant data from the EPR PDF. + + Returns: + dict: A dictionary containing extracted data, including: + - Address and Postcode + - SAP Rating and Primary Energy Use + - Lighting, Doors, Windows, Roof, and Wall Details + - Heating systems (Primary and Secondary) + - Building Parts + """ + data = {} + + with open(self.file_path, "rb") as file: + reader = PyPDF2.PdfReader(file) + text = "".join(page.extract_text() for page in reader.pages) + + data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip() + data["Assessment Date"] = re.search(r"\nAssessment Date\s*(.*?)\n", text).group(1).strip() + + # Extracting individual components + address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL) + if not address_match: + logger.error("Failed to extract address.") + raise ValueError("Failed to extract address.") + data["Address"] = address_match.group(1).strip() + data["Postcode"] = data["Address"].split(",")[-1].strip() + + # TODO: + data["Region"] = None + data["House Name"] = None + data["House No"] = None + data["Street"] = None + data["Locality"] = None + data["Town"] = None + data["County"] = None + + sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text) + if not sap_match: + logger.error("Failed to extract SAP rating.") + raise ValueError("Failed to extract SAP rating.") + data["Current SAP Rating"] = int(sap_match.group(1)) + + energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text) + if not energy_match: + logger.error("Failed to extract primary energy use.") + 
raise ValueError("Failed to extract primary energy use.") + data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1)) + + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + logger.error("Failed to extract the number of storeys.") + raise ValueError("Failed to extract the number of storeys.") + data["Number of Storeys"] = int(storeys_match.group(1)) + + fuel_match = re.search(r"TOTAL\s*£(\d+)", text) + if not fuel_match: + logger.error("Failed to extract fuel bill.") + raise ValueError("Failed to extract fuel bill.") + data["Fuel Bill"] = f"£{fuel_match.group(1)}" + + total_doors_match = re.search(r"Total Doors:\s*(\d+)", text) + if not total_doors_match: + logger.error("Failed to extract total doors.") + raise ValueError("Failed to extract total doors.") + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text) + if not insulated_doors_match: + logger.error("Failed to extract insulated doors.") + raise ValueError("Failed to extract insulated doors.") + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # Get number of lighting outlets and number of fittings needing LEL + lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text) + if not lighting_fittings_match: + logger.error("Failed to extract lighting.") + raise ValueError("Failed to extract lighting") + data["Number of Light Fittings"] = int(lighting_fittings_match.group(1)) + lel_fittings_match = re.search(r"Total number of L.E.L. 
fittings\s*(\d+)", text) + if not lel_fittings_match: + logger.error("Failed to extract LEL fittings.") + raise ValueError("Failed to extract LEL fittings.") + data["Number of LEL Fittings"] = int(lel_fittings_match.group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + if not windows_section: + logger.error("Failed to extract window data.") + raise ValueError("Failed to extract window data.") + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) + + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) + + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + if not water_heating_code_match: + logger.error("Failed to extract water heating code.") + raise ValueError("Failed to extract water heating code.") + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + return data + + +class ElmhurstSummaryReportExtractor: + """ + A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR). + """ + + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_window_age_description(windows_text): + """ + Extracts the most common window age description and its proportion. + + Parameters: + windows_text (str): The text section containing window data. + + Returns: + dict: A dictionary with the most common window age description and its proportion. 
+ """ + # Clean up windows_text by removing line breaks for better pattern matching + windows_text = windows_text.replace("\n", "") + + # Define possible window age descriptions + window_descriptions = [ + "Double post or during 2002", + "Double pre 2002", + "Double with unknown install date", + "Secondary glazing", + "Triple glazing", + "Single glazing", + ] + + # Count occurrences of each description + description_counts = Counter() + for description in window_descriptions: + matches = re.findall(re.escape(description), windows_text) + description_counts[description] = len(matches) + + if not description_counts or not sum(description_counts.values()): + raise ValueError("Failed to extract window data.") + + # Determine the most common description and calculate its proportion + most_common_description, window_count = description_counts.most_common(1)[0] + window_proportion = window_count / sum(description_counts.values()) * 100 + + # Get the second most common and the proportion + if window_proportion == 100: + second_most_common_description = None + second_most_common_proportion = 0 + else: + second_most_common_description, second_window_count = description_counts.most_common(2)[1] + second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100 + + return { + "Window Age Description": most_common_description, + "Window Age Description Proportion (%)": window_proportion, + "Secondary Window Age Description": second_most_common_description, + "Secondary Window Age Description Proportion (%)": second_most_common_proportion, + "Number of Windows": sum(description_counts.values()) + } + + @staticmethod + def extract_primary_heating(text): + primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL) + primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + primary_heating_section = primary_heating_section1 if primary_heating_section1 else 
primary_heating_section2 + if primary_heating_section is None: + raise ValueError("Failed to extract primary heating data.") + + primary_text = primary_heating_section.group(1) + + output = { + 'System': re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(1).strip(), + 'PCDF Reference': re.search(r"PCDF boiler Reference\s*(\d+)", primary_text).group(1), + 'Controls': re.search(r"Main Heating Controls\s*(.*?)\n", primary_text).group(1).strip(), + '% of Heat': int(re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)) + } + return output + + @staticmethod + def extract_secondary_heating_details(text): + secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL) + + # Defaults + output = { + "System": "", + "PCDF Reference": "", + "Controls": "", + "% of Heat": 0, + "Heating Code": "" + } + if secondary_heating_section is not None: + # Overwrite defaults + secondary_text = secondary_heating_section.group(1) + + main_heating_code_match_secondary = re.search( + r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text + ) + output["System"] = main_heating_code_match_secondary.group(1).strip() + output["PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)", secondary_text).group(1) + + second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text) + output["Heating Controls"] = ( + second_heating_controls_match.group(1).strip() if second_heating_controls_match else "" + ) + output["% of Heat"] = int( + re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1) + ) + + secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text) + if output["System"] != "": + output["Heating Code"] = ( + secondary_heating_code_match.group(1).strip() if secondary_heating_code_match else "" + ) + + return output + + @staticmethod + def extract_building_parts(text): + """ + Extracts building parts and associated dimensions from the summary 
report PDF. + This includes Main Property, multiple extensions if they exist, and Room in Roof areas. + """ + data = [] + + # Locate the Dimensions section + dimensions_section = re.search( + r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL + ) + if not dimensions_section: + raise ValueError("Failed to locate dimensions section in the text.") + + dimensions_text = dimensions_section.group(1) + + # Pattern to extract each building part, starting from Main Property and including extensions + building_part_pattern = re.compile( + r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*" + r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)", + re.DOTALL + ) + + # Loop through each building part match, including Main Property and extensions + for match in building_part_pattern.finditer(dimensions_text): + part_name = match.group(1) + floor_data = match.group(2) + + # Pattern to extract floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length + floor_pattern = re.compile( + r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)" + ) + + # Extract data for each floor within the building part + for floor_match in floor_pattern.finditer(floor_data): + floor_level = floor_match.group(1) + floor_area = float(floor_match.group(2)) + room_height = float(floor_match.group(3)) + perimeter = float(floor_match.group(4)) + party_wall_length = float(floor_match.group(5)) + + # Append to data list + data.append( + { + "Building Part": part_name, + "Floor Level": floor_level, + "Floor Area (m2)": floor_area, + "Room Height (m)": room_height, + "Perimeter (m)": perimeter, + "Party Wall Length (m)": party_wall_length + } + ) + + # Check specifically for "Room(s) in Roof" entries, which only have Floor Area + room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)") + room_in_roof_match = room_in_roof_pattern.search(floor_data) + if room_in_roof_match: + floor_area = 
float(room_in_roof_match.group(1)) + data.append( + { + "Building Part": part_name, + "Floor Level": "Room in Roof", + "Floor Area (m2)": floor_area, + "Room Height (m)": None, # Placeholder for missing data + "Perimeter (m)": None, # Placeholder for missing data + "Party Wall Length (m)": None # Placeholder for missing data + } + ) + + return data + + @staticmethod + def extract_roof_details(text): + """ + Extracts roof type, insulation, and insulation thickness for each building part + in the 8.0 Roofs section of the summary report. + """ + # Define data structure to hold results + roof_data = [] + + # Locate the entire 8.0 Roofs section + roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL) + if not roof_section_match: + return roof_data # Return empty if no roof section is found + + # Extract the roof section and append "9.0 Floors:" as the boundary + roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:" + + # Define pattern to match each building part's roof entry + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))" # Matches Roof Type until the next field, label, + # or end + r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?" 
# Optional Insulation + r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?", # Optional Insulation Thickness + re.DOTALL + ) + + # Extract each building part's data + for match in building_part_pattern.finditer(roof_section): + part_name = match.group(1).strip() # Building part label + roof_type = match.group(2).strip() # Roof Type + roof_insulation = match.group(3).strip() if match.group(3) else None # Optional Insulation + roof_insulation_thickness = match.group(4).strip() if match.group(4) else None # Optional Thickness + + # Cleaning to handle annoying cases when it comes out like this: + # 'A Another dwelling above\n1st Extension' + if roof_type.startswith("A Another dwelling above"): + roof_type = "A Another dwelling above" + + # Store results for this building part + roof_data.append( + { + "Building Part": part_name, + "Roof Type": roof_type, + "Roof Insulation": roof_insulation, + "Roof Insulation Thickness": roof_insulation_thickness, + } + ) + + return roof_data + + @staticmethod + def extract_wall_details(text): + """ + Extracts wall type, insulation, dry-lining, and thickness for each building part, + including any alternative wall details within the 7.0 Walls section of the summary PDF text. + """ + # Define data structure to hold all building part wall entries + wall_data = [] + + # Locate the entire 7.0 Walls section + wall_section = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL).group(1) + + # Define pattern to match each building part's wall entry within the section + building_part_pattern = re.compile( + r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n" # Matches each building part label + r"Type\s+(.*?)\n" # Matches main wall Type + r"Insulation\s+(.*?)\n" # Matches main wall Insulation + r"(Dry-lining\s+(.*?)\n)?" 
# Optional main wall Dry-lining + r"Wall Thickness Unknown\s+(.*?)\n" # Matches main wall Thickness Unknown + r"Wall Thickness \[mm\]\s+(\d+)", # Matches main wall Thickness + re.DOTALL + ) + + # Define pattern to capture alternative wall details, if present + alternative_wall_pattern = re.compile( + r"Alternative Wall Area.*?\n" # Matches start of alternative wall section + r"Alternative Type\s+(.*?)\n" # Matches alternative wall Type + r"Alternative Insulation\s+(.*?)\n" # Matches alternative wall Insulation + r"(Alternative Dry-lining\s+(.*?)\n)?" # Optional Alternative Dry-lining + r"Alternative Wall Thickness Unknown\s+(.*?)\n" # Matches alternative wall Thickness Unknown + r"Alternative Wall Thickness\s+(\d+)", # Matches alternative wall Thickness + re.DOTALL + ) + + # Find all building part entries within the 7.0 Walls section + for match in building_part_pattern.finditer(wall_section): + wall_label = match.group(1).strip() + main_wall_type = match.group(2).strip() + main_wall_insulation = match.group(3).strip() + main_wall_dry_lining = match.group(5).strip() if match.group(5) else "N/A" + main_wall_thickness_unknown = match.group(6).strip() + main_wall_thickness = int(match.group(7)) + + # Initialize dictionary for this wall entry + wall_entry = { + "Building Part": wall_label, + "Wall Type": main_wall_type, + "Wall Insulation": main_wall_insulation, + "Wall Dry-lining": main_wall_dry_lining, + "Wall Thickness Unknown": main_wall_thickness_unknown, + "Wall Thickness (mm)": main_wall_thickness, + "Alternative Wall Type": None, + "Alternative Wall Insulation": None, + "Alternative Wall Dry-lining": "N/A", + "Alternative Wall Thickness Unknown": None, + "Alternative Wall Thickness (mm)": None, + } + + # Check if there's an alternative wall section following this wall entry + alt_match = alternative_wall_pattern.search(wall_section, match.end()) + if alt_match: + wall_entry["Alternative Wall Type"] = alt_match.group(1).strip() + wall_entry["Alternative Wall 
Insulation"] = alt_match.group(2).strip() + wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A" + wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip() + wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6)) + + # Append each building part as a dictionary in the wall_data list + wall_data.append(wall_entry) + + return wall_data + + @staticmethod + def extract_conservatory(text): + """ + Extracts conservatory data from the provided text. + The section is located between "5.0 Conservatory" and "7.0 Walls". + + Args: + text (str): The full text of the Summary Report PDF. + + Returns: + dict: A dictionary with conservatory details: + - "Conservatory Present" + - "Conservatory Separated" + - "Conservatory Floor Area" + - "Conservatory Double Glazed" + - "Conservatory Glazed Perimeter" + - "Heated Conservatory Height" + """ + + # Extract the section between "5.0 Conservatory" and "7.0 Walls" + conservatory_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL) + if not conservatory_match: + logger.error("Failed to extract conservatory data.") + raise ValueError("Could not extract conservatory data.") + + conservatory_text = conservatory_match.group(1) + + # Check if conservatory is present + present_match = re.search(r"Is there a conservatory\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + + if not present_match or present_match.group(1).strip().lower() == "no": + return { + "Conservatory Present": "No", + "Conservatory Separated": "", + "Conservatory Floor Area": 0, + "Conservatory Double Glazed": "", + "Conservatory Glazed Perimeter": 0, + "Heated Conservatory Height": "", + } + + # If we get here, raise a temporary exception since we've not seen a case of this, so should make sure + # this is correct + + separated_match = re.search(r"Is it thermally separated\?\s*(Yes|No)", conservatory_text, re.IGNORECASE) + floor_area_match = re.search(r"Floor Area 
\[m2\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + double_glazed_match = re.search(r"Double Glazed\s*(Yes|No)", conservatory_text, re.IGNORECASE) + glazed_perimeter_match = re.search(r"Glazed Perimeter \[m\]\s*([\d.]+)", conservatory_text, re.IGNORECASE) + height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", conservatory_text, re.IGNORECASE) + + return { + "Conservatory Present": "Yes", + "Conservatory Separated": separated_match.group(1).strip() if separated_match else "", + "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0, + "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "", + "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0, + "Heated Conservatory Height": height_match.group(1).strip() if height_match else "", + } + + def extract(self): + """ + Extracts specific data from the provided PDF file. + Data includes: + - Current SAP rating + - Fuel Bill + - Address + """ + + data = {} + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) + text = "" + for page in reader.pages: + text += page.extract_text() + + # Match and extract + name_match = re.search(r"Name:\s*([A-Za-z\s]+)\s*Title:\s*([A-Za-z\.]+)", text) + if not name_match: + raise ValueError("Couldn't extract surveyor name") + data["Assessor Name"] = name_match.group(2).strip() + " " + name_match.group(1).strip() + data["Assessment Date"] = re.search(r"Inspection Date:\s*(.*?)\n", text).group(1).strip() + + # Address and postcode + postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text) + postcode = postcode.group(1).strip() if postcode else "" + + region = re.search(r"Region:\s*(.*?)\nHouse Name:", text) + region = region.group(1).strip() if region else "" + + house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text) + house_name = house_name.group(1).strip() if house_name else "" + + house_no = re.search(r"House 
No:\s*(.*?)\nStreet:", text) + house_no = house_no.group(1).strip() if house_no else "" + + street = re.search(r"Street:\s*(.*?)\nLocality:", text) + street = street.group(1).strip() if street else "" + + locality = re.search(r"Locality:\s*(.*?)\nTown:", text) + locality = locality.group(1).strip() if locality else "" + + town = re.search(r"Town:\s*(.*?)\nCounty:", text) + town = town.group(1).strip() if town else "" + + county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text) + county = county.group(1).strip() if county else "" + + # Clean extracted values and remove any prefixes + address_parts = [ + house_no, + house_name, + street, + locality, + town, + county, + region, + postcode + ] + + # Join non-empty parts with a comma + data["Address"] = ", ".join([part for part in address_parts if part]) + data["Postcode"] = postcode + data["Region"] = region + data["House Name"] = house_name + data["House No"] = house_no + data["Street"] = street + data["Locality"] = locality + data["Town"] = town + data["County"] = county + + # Extract Current SAP rating + sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text) + if not sap_match: + raise ValueError("Could not extract SAP rating") + data["Current SAP Rating"] = sap_match.group(1).split(" ")[1] + + # We don't have primary energy in the summary report + data['Primary Energy Use Intensity (kWh/m2/yr)'] = None + + # Number of storeys + storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text) + if not storeys_match: + raise ValueError("Could not extract number of storeys") + data["Number of Storeys"] = int(storeys_match.group(1)) + + # Extract Fuel Bill + fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text) + if not fuel_bill_match: + raise ValueError("Could not extract fuel bill") + data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}" + + # Extract Total Number of Doors + total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text) + if not total_doors_match: + raise ValueError("Could not 
extract total number of doors") + data["Total Number of Doors"] = int(total_doors_match.group(1)) + + # Extract Number of Insulated Doors + insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text) + if not insulated_doors_match: + raise ValueError("Could not extract number of insulated doors") + data["Number of Insulated Doors"] = int(insulated_doors_match.group(1)) + + # lighting + data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1)) + data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1)) + data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"] + + windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL) + if not windows_section: + raise ValueError("Failed to extract window data.") + data["Windows"] = self.extract_window_age_description(windows_section.group(1)) + + data["Primary Heating"] = self.extract_primary_heating(text) + data["Secondary Heating"] = self.extract_secondary_heating_details(text) + data["Building Parts"] = self.extract_building_parts(text) + data["Roof Details"] = self.extract_roof_details(text) + data["Wall Details"] = self.extract_wall_details(text) + data["Conservatory"] = self.extract_conservatory(text) + + water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text) + if not water_heating_code_match: + raise ValueError("Failed to extract water heating code.") + + data["Water Heating Code"] = water_heating_code_match.group(1).strip() + + return data + + +class PulseAirPermeabilityExtractor: + """ + A utility class for extracting specific data from Pulse Air Permeability Test Reports. 
+ """ + + def __init__(self, file_path): + self.file_path = file_path + + @staticmethod + def extract_table(text): + patterns = { + "Air Leakage Rate": r"Air Leakage Rate\s*([\d,@.]+)\s*m/h\s*([\d,@.]+)\s*m3/h", + "Air Permeability": r"Air Permeability\s*([\d,@.]+)\s*=.*?\s*([\d,@.]+)\s*m\?/m\?h", + "Air Changes per Hour": r"Air Changes per Hour\s*([\d,@.]+)\s*([\d,@.]+)", + "Equivalent Leakage Area": r"Equivalent Leakage Area\s*([\d,@.]+)\s*([\d,@.]+)", + "Calculation Uncertainty": r"Calculation Uncertainty\s*([\d,@.]+)\s*([\d,@.]+)", + } + + # Initialize results dictionary + table_data = [] + + # Parse each metric using the corresponding regex + for metric, pattern in patterns.items(): + match = re.search(pattern, text) + if match: + # Extract the two column values + first_value = match.group(1) + second_value = match.group(2) + + # Post-process values: replace '@' with '0' and remove commas + first_value = first_value.replace("@", "0").replace(",", "") + second_value = second_value.replace("@", "0").replace(",", "") + + table_data.append( + { + "Metric": metric, + "Measured @ 4PA": first_value, + "Extrapolated @ 50PA": second_value, + } + ) + else: + raise ValueError(f"Could not extract metric: {metric}") + + return table_data + + def extract(self): + # Extract the pdf using tesseract + logger.info("Extracting data from pdf image - this may take a while...") + pages = convert_from_path(self.file_path, dpi=300) + # Extract all of the pages + text = "" + for page in pages: + text += image_to_string(page) + + # We extract the air permeability reading + results_table = self.extract_table(text) + data = { + "Results Table": results_table + } + + return data + + +class ElmhurstProjectHandoverExtractor: + """ + A utility class for extracting specific data from The Elmhurst Project Handover document + """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + + with (open(self.file_path, "rb") as file): + reader = PyPDF2.PdfReader(file) 
+ text = "" + for page in reader.pages: + text += page.extract_text() + + data = {} + + # Regex patterns + patterns = { + "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)", + "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)", + "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:", + "Designer Name": r"Designer Name\(s\):\s*(.+)", + "Installer Name": r"Installer Name\(s\):\s*(.+)", + } + + # Extract data + for key, pattern in patterns.items(): + match = re.search(pattern, text) + if not match: + raise ValueError(f"Could not match {key}") + if match: + if key == "Measures Fitted": + # Special handling for multiline measures + measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1)) + measures = [m.strip() for m in measures] + data[key] = measures + else: + data[key] = match.group(1).strip() if match else "" + + return data + + +class CoreLogicPasAssessmentReportExtractor: + """ + A utility class for extracting specific data from CoreLogic PAS Assessment Reports. 
+ """ + + def __init__(self, file_path): + self.file_path = file_path + + def extract(self): + data = {} + + with pdfplumber.open(self.file_path) as pdf: + for page in pdf.pages: + tables = page.extract_tables() + if tables: # If tables are detected on the page + for table in tables: + for row in table: + # Check if the row contains "Number of bedrooms" + if any("Number of bedrooms" in str(cell) for cell in row): + # Extract the corresponding value by filtering out None and non-relevant cells + for cell in row: + if cell and cell.strip().isdigit(): # Check if cell contains a numeric value + data["Number of bedrooms"] = int(cell.strip()) + break # Stop further processing once value is found + + return data diff --git a/utils/fullSapParser.py b/utils/fullSapParser.py new file mode 100644 index 00000000..540eff6f --- /dev/null +++ b/utils/fullSapParser.py @@ -0,0 +1,306 @@ +import boto3 +from xml.dom.minidom import parseString + +PROPERTY_AGE_BAND = { + "A": "before 1900", + "B": "1900-1929", + "C": "1930-1949", + "D": "1950-1966", + "E": "1967-1975", + "F": "1976-1982", + "G": "1983-1990", + "H": "1991-1995", + "I": "1996-2002", + "J": "2003-2006", + "K": "2007-2011", + "L": "2012 onwards" +} + +POSITION_OF_FLAT = { + "TopFloorFlat": "(top floor)" +} + +MAINHEATING_LOOKUP = { + "SEB": "Electric (SEB modern slimline storage heaters)" +} + +WINDOWS_YEAR_LOOKUP = { + "unknown install date": "unknown year", + "unknown install": "unknown year", + "post or during 2002": "2002 onwards", +} + + +class FullSapParser: + full_address = None + archetype = None + age_band = None + unheated_corridor = None + property_type = None + built_form = None + + # ventilation + mechanical_ventilation = None + cross_ventilation = None + night_ventilation = None + + # dimensions + number_of_storeys = None + property_dimensions = None + + # fabric + low_energy_lighting = None + + # Heating + heating1 = None + cylinder = None + cylinder_stat = None + + def __init__(self, filekey, 
bucket_name=None): + self.s3_client = boto3.client('s3') + self.bucket_name = bucket_name + self.filekey = filekey + self.full_sap = None + + self._read_file() + + def _read_file(self): + """ + Reads the XML file either locally or from S3 and parses it using minidom. + + Raises: + ValueError: If the file cannot be found, read, or parsed. + """ + try: + if self.bucket_name: + # Read from S3 + response = self.s3_client.get_object(Bucket=self.bucket_name, Key=self.filekey) + xml_content = response['Body'].read() + else: + # Read locally + with open(self.filekey, "r") as f: + xml_content = f.read() + + # Parse the XML content using minidom + self.full_sap = parseString(xml_content) + except FileNotFoundError: + raise ValueError(f"Local file not found: {self.filekey}") + except Exception as e: + raise ValueError(f"An error occurred while reading or parsing the XML: {e}") + + def extract(self, _return=True): + self.get_address() + self.get_archetype() + self.get_age_band() + self.get_unheated_corridor() + self.get_heating_1() + self.get_ventilation() + self.get_floor_area() + self.get_low_energy_lighting() + self.get_cylinder() + + if _return: + return { + "Property Type": self.property_type, + "Built Form": self.built_form, + "Age Band": self.age_band, + } + + def get_address(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + address = self.full_sap.getElementsByTagName("AddressAsDesigned") + if len(address) != 1: + raise ValueError("Non-unique address tag found - investigate me") + + address = address[0] + data = {} + for node in address.childNodes: + if node.nodeType == node.ELEMENT_NODE: + data[node.nodeName] = node.firstChild.nodeValue if node.firstChild else None + + self.full_address = " ".join( + [ + x.title() for x in [data["AddressLine1"], data["AddressLine2"], data["AddressLine3"], data["Town"]] + if x is not None + ] + ) + " " + data["Postcode"] + + def get_archetype(self): + if not self.full_sap: + raise ValueError("You 
need to read the file first") + + property_type1 = self.full_sap.getElementsByTagName('PropertyType1') + property_type2 = self.full_sap.getElementsByTagName('PropertyType2') + position_of_flat = self.full_sap.getElementsByTagName('PositionOfFlat') + + if len(property_type1) != 1 or len(property_type2) != 1: + raise ValueError("Non-unique property tag found - investigate me") + + property_type1 = property_type1[0].firstChild.nodeValue + property_type2 = property_type2[0].firstChild.nodeValue + if position_of_flat[0].firstChild: + position_of_flat = POSITION_OF_FLAT[position_of_flat[0].firstChild.nodeValue] + else: + position_of_flat = None + + self.property_type = property_type1 + self.built_form = property_type2 + self.archetype = property_type1 + " - " + property_type2 + + if position_of_flat: + self.archetype = self.archetype + " " + position_of_flat + + def get_age_band(self): + if not self.full_sap: + raise ValueError("You need to read the file first") + + property_age_band = self.full_sap.getElementsByTagName('PropertyAgeBand') + + if len(property_age_band) != 1: + raise ValueError("Non-unique property age band tag found - investigate me") + + property_age_band = property_age_band[0].firstChild.nodeValue + self.age_band = PROPERTY_AGE_BAND[property_age_band] + + def get_wall_area_for_description(self, description): + wall_recs = self.full_sap.getElementsByTagName("WallRec") + for wall_rec in wall_recs: + desc_elements = wall_rec.getElementsByTagName("Description") + if desc_elements and desc_elements[0].firstChild.data == description: + area_elements = wall_rec.getElementsByTagName("Area") + if area_elements: + area = float(area_elements[0].firstChild.data) + # Placeholder for wall_description which you'll populate later + return f"Unheated corridor - {area} area" + return None + + def get_unheated_corridor(self): + """ + Unheated corridors don't always exist so we'll need to search for it + :return: + """ + + if not self.full_sap: + raise ValueError("You need 
to read the file first") + + self.unheated_corridor = self.get_wall_area_for_description("Flat corridor Main") + + def get_heating_1(self): + + if not self.full_sap: + raise ValueError("You need to read the file first") + + main_heating_system = self.full_sap.getElementsByTagName('MainHeatingSystem1') + + if len(main_heating_system) != 1: + raise ValueError("Non-unique main heating system tag found - investigate me") + + main_heating_system = main_heating_system[0] + + mhs = main_heating_system.getElementsByTagName('MHS')[0].firstChild.nodeValue + mhs = MAINHEATING_LOOKUP.get(mhs, mhs) + + fraction = main_heating_system.getElementsByTagName('Fraction')[0].firstChild.nodeValue + + self.heating1 = f"{mhs} : {fraction}% of heating" + + def get_ventilation(self): + + bool_lookup = { + "true": True, + "false": False + } + + # Extract MechanicalVentilationDecentralised + mech_vent = self.full_sap.getElementsByTagName("MechanicalVentilationDecentralised") + if mech_vent and mech_vent[0].childNodes: + mech_vent_value = mech_vent[0].firstChild.nodeValue + else: + mech_vent_value = None + + # Extract CrossVentilation + cross_vent = self.full_sap.getElementsByTagName("CrossVentilation") + if cross_vent and cross_vent[0].childNodes: + cross_vent_value = cross_vent[0].firstChild.nodeValue + cross_vent_value = bool_lookup.get(cross_vent_value, cross_vent_value) + else: + cross_vent_value = None + + # Extract NightVentilation + night_vent = self.full_sap.getElementsByTagName("NightVentilation") + if night_vent and night_vent[0].childNodes: + night_vent_value = night_vent[0].firstChild.nodeValue + night_vent_value = bool_lookup.get(night_vent_value, night_vent_value) + else: + night_vent_value = None + + # Create the outputs + self.mechanical_ventilation = "Mechanical ventilation present" if mech_vent_value else "No mechanical " \ + "ventilation" + self.cross_ventilation = "Cross ventilation present" if cross_vent_value else "No cross ventilation" + self.night_ventilation = "Night 
ventilation present" if night_vent_value else "No night ventilation" + + def get_floor_area(self): + + self.number_of_storeys = int(self.full_sap.getElementsByTagName('NumberOfStoreys')[0].firstChild.nodeValue) + storeys = self.full_sap.getElementsByTagName('StoreyMeasurementRec') + + # TODO: The first StoreyMeasurementRec tag looks like this in the examples we've seen: + # + # Indicating that the tag is explicitly indicated as empty + + storey_data = [] + storey_index = -1 + for storey in storeys: + storey_index += 1 + + if storey.getAttribute("xsi:nil") == "true": + continue + + if storey_index == -1: + raise NotImplementedError( + "Investigated me - potentially basement found but need to confirm with Basement tag" + ) + + floor_area = storey.getElementsByTagName('InternalFloorArea') + if not floor_area: + continue + + floor_area = float(floor_area[0].firstChild.nodeValue) + # If floor area is 0, skip this storey + if not floor_area: + continue + + perimeter = float(storey.getElementsByTagName('InternalPerimeter')[0].firstChild.nodeValue) + height = float(storey.getElementsByTagName('StoreyHeight')[0].firstChild.nodeValue) + + storey_data.append({ + "storey_index": storey_index, + "Floor Area": floor_area, + "Perimeter": perimeter, + "Height": height + }) + + # We will convert this into a table in the markdown + self.property_dimensions = storey_data + + def get_low_energy_lighting(self): + # Extract the values of the LightFittings and LELFittings tags + light_fittings = self.full_sap.getElementsByTagName('LightFittings')[0].firstChild.data + lel_fittings = self.full_sap.getElementsByTagName('LELFittings')[0].firstChild.data + + # Construct the string message + self.low_energy_lighting = f"{lel_fittings} out of {light_fittings} lighting fittings are low energy." 
def get_cylinder(self):
    """
    Populate the hot water cylinder attributes from the parsed SAP XML.

    Sets ``self.cylinder`` to a description of the cylinder insulation built from the
    first ``InsulationType`` and ``InsulationThickness`` tags, or to "Not insulated."
    when either tag is missing or empty. Sets ``self.cylinder_stat`` to the text of the
    first ``CylinderStat`` tag, or ``None`` when that tag is missing or empty.

    Raises:
        ValueError: If the XML document has not been read yet.
    """
    if not self.full_sap:
        raise ValueError("You need to read the file first")

    def _first_text(tag_name):
        # Text of the first <tag_name> element, or None when the tag is absent or has no
        # content. An empty element has firstChild None, so indexing/dereferencing
        # blindly would raise instead of letting us fall back to a default.
        elements = self.full_sap.getElementsByTagName(tag_name)
        if not elements:
            return None
        first_child = elements[0].firstChild
        if first_child is None:
            return None
        return first_child.data

    insulation_type = _first_text('InsulationType')
    insulation_thickness = _first_text('InsulationThickness')

    # Both values are needed to describe the insulation; otherwise treat as uninsulated
    if insulation_type and insulation_thickness:
        self.cylinder = f"Insulated, {insulation_type}: {insulation_thickness}mm."
    else:
        self.cylinder = "Not insulated."

    self.cylinder_stat = _first_text('CylinderStat')