diff --git a/.gitignore b/.gitignore
index 63884ad7..5e247d77 100644
--- a/.gitignore
+++ b/.gitignore
@@ -268,4 +268,11 @@ adhoc
adhoc/*
etl-router-venv/
-refactor_datasets/
\ No newline at end of file
+refactor_datasets/
+
+etl/eligibility/ha_15_32/
+cache/
+*/.idea
+
+*.png
+*.pptx
\ No newline at end of file
diff --git a/.idea/terraform.xml b/.idea/terraform.xml
new file mode 100644
index 00000000..cd46a3d3
--- /dev/null
+++ b/.idea/terraform.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
new file mode 100644
index 00000000..af5a3faf
--- /dev/null
+++ b/asset_list/AssetList.py
@@ -0,0 +1,2436 @@
+import hashlib
+import os
+import re
+import tiktoken
+from pprint import pprint
+from datetime import datetime
+
+from openai import OpenAI
+import numpy as np
+import pandas as pd
+from tqdm import tqdm
+from fuzzywuzzy import process
+from utils.logger import setup_logger
+from backend.SearchEpc import SearchEpc
+from BaseUtility import Definitions
+import asset_list.mappings.property_type as property_type_mappings
+import asset_list.mappings.walls as walls_mappings
+import asset_list.mappings.heating_systems as heating_mappings
+import asset_list.mappings.exising_pv as existing_pv_mappings
+import asset_list.mappings.built_form as built_form_mappings
+import asset_list.mappings.roof as roof_mappings
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+logger = setup_logger()
+
+# OpenAI API Key (set this in your environment variables for security)
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+
+class DataRemapper:
+    """
+    Maps free-text category values onto a fixed set of standard values.
+
+    Each value is resolved by the first strategy that succeeds: the predefined mapping, an exact match
+    against the standard values, a fuzzy match, and finally one batched OpenAI request for the remainder.
+    """
+
+    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
+        """
+        Initialize the remapper with standard values and a predefined mapping.
+
+        :param standard_values: Set of allowed standardized values.
+        :param standard_map: Dictionary of common remappings {raw_value: standard_value}. An empty mapping
+            is used when this is omitted.
+        :param max_tokens: Largest prompt (in tokens) that will be sent, and the completion limit passed
+            to the OpenAI API.
+        """
+        self.standard_values = standard_values
+        # Membership tests are run against the map, so a missing map must become an empty dict
+        self.standard_map = standard_map if standard_map is not None else {}
+        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
+        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing
+
+        # Tokenizer for counting tokens
+        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)
+
+        # Track token usage and remap dictionary
+        self.total_tokens_used = 0
+        self.total_cost = 0
+        self.remap_dict = {}  # {original_value: standardized_value}
+        self.max_tokens = max_tokens  # Limit for OpenAI API
+
+        # Memoization for AI calls
+        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
+        # Capture the response for debugging
+        self.ai_response = None
+
+        # OpenAI pricing (as of Feb 2024)
+        self.pricing = {
+            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
+            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
+        }
+
+        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+    @staticmethod
+    def clean_string(text):
+        """Basic text cleaning: remove extra spaces, punctuation, and normalize case. Non-strings give None."""
+        if not isinstance(text, str):
+            return None
+        text = text.strip().lower()
+        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+        # Collapse runs of whitespace into a single space
+        text = re.sub(r'\s+', ' ', text)
+        return text
+
+    def fuzzy_match(self, text):
+        """Use fuzzy matching to find the closest standard value, or None if nothing is close enough."""
+        if not text:
+            return None
+        best = process.extractOne(text, self.standard_values)
+        # extractOne gives None when there are no candidates to compare against
+        if best is None:
+            return None
+        match, score = best[0], best[1]
+        return match if score >= self.fuzzy_threshold else None
+
+    def count_tokens(self, text):
+        """Estimate the number of tokens in a given text."""
+        return len(self.tokenizer.encode(text)) if text else 0
+
+    @staticmethod
+    def _parse_ai_mapping(output_text):
+        """
+        Parse the model's reply into a dictionary without executing it.
+
+        :param output_text: Raw text returned by the model.
+        :return: The parsed dictionary, or None when the reply is not a dictionary literal.
+        """
+        import ast
+        import json
+
+        text = output_text.strip()
+        # Tolerate a markdown code fence even though the prompt asks for none
+        if text.startswith("```"):
+            text = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", text)
+
+        # JSON is what the prompt requests; literal_eval additionally accepts a Python-style dict
+        # (single quotes) while still refusing to run arbitrary expressions the way eval() would.
+        for parser in (json.loads, ast.literal_eval):
+            try:
+                parsed = parser(text)
+            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
+                continue
+            if isinstance(parsed, dict):
+                return parsed
+        return None
+
+    def ai_standardize(self, unmapped_values):
+        """
+        Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.
+
+        :param unmapped_values: List of raw values that no other strategy could map.
+        :return: Dictionary {original_value: standardized_value}. Values the model did not return, or all
+            values when the reply cannot be parsed, are mapped to "unknown".
+        :raises ValueError: If the prompt is longer than max_tokens.
+        """
+        if not unmapped_values:
+            return {}
+
+        # Sort by string form so the memoization key is stable even for mixed value types
+        unmapped_tuple = tuple(sorted(unmapped_values, key=str))
+        if unmapped_tuple in self.ai_cache:
+            return self.ai_cache[unmapped_tuple]  # Return memoized result
+
+        prompt = f"""
+        You are an expert in data classification. Standardize each of these values into one of the categories:
+        {list(self.standard_values)}.
+
+        Return only a JSON dictionary where:
+        - The keys are the original values.
+        - The values are the standardized ones.
+
+        Strictly return JSON **without markdown formatting** or extra text.
+
+        Example Output:
+        {{
+        "BLKHOUS": "block house",
+        "BEDSIT": "bedsit"
+        }}
+
+        Values to standardize:
+        {unmapped_values}
+        """
+
+        # Count input tokens
+        input_tokens = self.count_tokens(prompt)
+        if input_tokens > self.max_tokens:
+            raise ValueError("Input tokens exceed the maximum limit.")
+
+        logger.info("Calling OpenAI API for standardization...")
+        response = self.openai_client.chat.completions.create(
+            model=self.ai_model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=self.max_tokens,
+            temperature=0.1,
+        )
+
+        output_text = response.choices[0].message.content.strip()
+        output_tokens = self.count_tokens(output_text)  # Count output tokens
+
+        # Track total token usage
+        self.total_tokens_used += input_tokens + output_tokens
+
+        # Estimate cost
+        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
+        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
+        self.total_cost += input_cost + output_cost
+
+        mapping = self._parse_ai_mapping(output_text)
+        if mapping is None:
+            logger.warning("Could not parse the AI response as a dictionary - marking values as unknown")
+            mapping = {}
+        # Fallback: anything the model left out (or everything, on a parse failure) becomes "unknown"
+        for val in unmapped_values:
+            mapping.setdefault(val, "unknown")
+
+        # Memoize the AI response
+        self.ai_cache[unmapped_tuple] = mapping
+        logger.debug(f"AI Response: {mapping}")
+        # We store the raw AI response for debugging
+        self.ai_response = output_text
+
+        return mapping
+
+    def standardize_list(self, values_to_remap):
+        """
+        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.
+
+        :param values_to_remap: List of raw values to standardize.
+        :return: Dictionary {original_value: standardized_value}.
+        """
+        unique_values = set(values_to_remap)  # Process only unique values
+
+        unmapped_values = []
+        for value in unique_values:
+            if pd.isna(value):  # Handle NaN values
+                self.remap_dict[value] = "unknown"
+                continue
+
+            cleaned_value = self.clean_string(value)
+
+            # Rule-Based Check (Predefined Mapping): cleaned form first, then the raw value, then the
+            # lower-cased raw value. Non-string values only have a raw form.
+            lowered_value = value.lower() if isinstance(value, str) else None
+            map_key = next(
+                (
+                    key for key in (cleaned_value, value, lowered_value)
+                    if key is not None and key in self.standard_map
+                ),
+                None
+            )
+            if map_key is not None:
+                self.remap_dict[value] = self.standard_map[map_key]
+                continue
+
+            # Exact Match in Standard Values
+            if cleaned_value in self.standard_values:
+                self.remap_dict[value] = cleaned_value
+                continue
+
+            # Fuzzy Matching
+            fuzzy_match = self.fuzzy_match(cleaned_value)
+            if fuzzy_match:
+                self.remap_dict[value] = fuzzy_match
+                continue
+
+            # Capture anything that wasn't mapped
+            unmapped_values.append(value)
+
+        # AI Model - remap anything unmapped (batch request)
+        ai_mapping = self.ai_standardize(unmapped_values)
+        self.remap_dict.update(ai_mapping)
+
+        return self.remap_dict
+
+    def report_usage(self):
+        """Prints a summary of token usage and cost."""
+        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
+        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
+
+
+class AssetList:
+ """
+ This class is used to standardise asset lists so that we can process the core information in a consistent manner.
+ """
+
+ # Maps field names as returned by the EPC API onto the column names used for them here
+ EPC_API_DATA_NAMES = {
+ "uprn": "epc_os_uprn",
+ "address1": "epc_address1",
+ "address": "epc_address",
+ "postcode": "epc_postcode",
+ "inspection-date": "epc_inspection_date",
+ "current-energy-efficiency": "epc_sap_score_on_register",
+ "current-energy-rating": "epc_rating_on_register",
+ "property-type": "epc_property_type",
+ "built-form": "epc_archetype",
+ "total-floor-area": "epc_total_floor_area",
+ "construction-age-band": "epc_age_band",
+ "floor-height": "epc_floor_height",
+ "number-habitable-rooms": "epc_number_habitable_rooms",
+ "walls-description": "epc_wall_construction",
+ "roof-description": "epc_roof_construction",
+ "floor-description": "epc_floor_construction",
+ "mainheat-description": "epc_heating_type",
+ 'mainheatcont-description': "epc_heating_controls",
+ "secondheat-description": "epc_secondary_heating",
+ "transaction-type": "epc_reason",
+ "energy-consumption-current": "epc_heat_demand",
+ "photo-supply": "epc_photo_supply",
+ "estimated": "estimated"
+ }
+ # Maps labels onto the column names used for them here - presumably labels scraped from the
+ # "find an EPC" service; confirm against the code that consumes this.
+ # NOTE(review): "epc_estiamted_heating_kwh" is misspelt ("estiamted"); left as-is because other code may
+ # depend on the exact column name.
+ # NOTE(review): the two "Assessor’s ..." keys use a typographic apostrophe while the "Assessor's ..." keys
+ # use a straight one - confirm this matches the source data.
+ FIND_EPC_DATA_NAMES = {
+ "heating_text": "epc_estiamted_heating_kwh",
+ "hot_water_text": "epc_estimated_hotwater_kwh",
+ 'Assessor’s name': "epc_assessor_name",
+ "Assessor's Telephone": "epc_assessor_telephone",
+ "Assessor's Email": "epc_assessor_email",
+ "Accreditation scheme": "epc_assessor_accreditation",
+ "Assessor’s ID": "epc_assessor_id",
+ "Solar photovoltaics": "epc_solar_pv"
+ }
+
+ # Replacements applied to the year built column before it is parsed as a datetime
+ DATETIME_REMAP = {
+ "Pre 1900": datetime(year=1899, month=12, day=31),
+ }
+
+ # These are the accepted methods we have for cleaning the address1 column
+ ADDRESS_1_CLEANING_METHODS = [
+ "first_two_words", # This method will split on the first two words, where the separator is a space
+ "first_word", # This method will split on the first word, where the separator is a space
+ "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber
+ # "address1_extraction" # This method will use the NLP model to extract address1
+ ]
+
+ # Standard column Names
+ STANDARD_ADDRESS_1 = "domna_address_1"
+ STANDARD_POSTCODE = "domna_postcode"
+ STANDARD_FULL_ADDRESS = "domna_full_address"
+ STANDARD_YEAR_BUILT = "landlord_year_built"
+ STANDARD_UPRN = "ordnance_survey_uprn"
+ STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id"
+ STANDARD_PROPERTY_TYPE = "landlord_property_type"
+ STANDARD_BUILT_FORM = "landlord_built_form"
+ STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
+ STANDARD_ROOF_CONSTRUCTION = "landlord_roof_construction"
+ STANDARD_HEATING_SYSTEM = "landlord_heating_system"
+ STANDARD_EXISTING_PV = "landlord_existing_pv"
+ STANDARD_SAP = "landlord_sap_rating"
+
+ # Name of the column holding the ID we generate for each property
+ DOMNA_PROPERTY_ID = "domna_property_id"
+
+ # Regular expression for identifying if the address might point to multiple units
+ # (two alphanumeric tokens joined by a hyphen, e.g. "1-3")
+ MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')
+
+ # List of columns relating to the non-intrusive data
+ NON_INTRUSIVES_COLNAMES = [
+ "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required",
+ "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION",
+ "Any further surveyor notes", 'Surveyors Name'
+ ]
+
+ # Optional eligibility column that can accompany the non-intrusive data
+ NON_INTRUSIVES_ELIGIBILITY_COLUMN = "Eligibility (Red/Yellow/Green)"
+
+ # Columns used by the old format of the non-intrusive data
+ OLD_FORMAT_NON_INTRUSIVE_COLNAMES = ['WFT Findings', 'ECO Eligibility']
+
+ # This SAP threshold is a key search criteria for properties that may be eligible for extraction
+ FILLED_CAVITY_SAP_THRESHOLD = 75
+ # NOTE(review): the original comment here was left unfinished ("This SAP the"); presumably this is the
+ # equivalent SAP threshold for empty-cavity properties - confirm. It currently has the same value as
+ # FILLED_CAVITY_SAP_THRESHOLD.
+ EMPTY_CAVITY_SAP_THRESHOLD = 75
+ # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable.
+ # This is evaluated once, when the class body is executed, not on each use.
+ EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
+
+ # Properties before this year are more likely to have lower EPC ratings and more likely to qualify
+ EMPTY_CAVITY_YEAR_THRESHOLD = 2002
+
+ # Attributes - these are columns that we produce, calculated based on other pieces of data
+ # NOTE(review): "attribute_est_perimter" is misspelt ("perimter"); left as-is because other code may depend
+ # on the exact column name.
+ ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
+ ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
+ ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
+ ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
+ ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
+ # The two names below embed the threshold values defined above
+ ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below"
+ ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}"
+
+ # These are the descriptions that we look for in the EPC data that are indicative of no insulation
+ # (the list also includes the "partial insulation" descriptions)
+ EPC_NO_WALL_INSULATION_DESCRIPTIONS = [
+ "cavity wall, as built, no insulation (assumed)",
+ "cavity wall, as built, partial insulation (assumed)",
+ "cavity wall, as built, partial insulation",
+ "cavity wall, as built, no insulation",
+ ]
+
+ # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated
+ EPC_INSULATED_WALLS_SUBSTRINGS = [
+ ", insulated", "with external insulation", "with internal insulation", "filled cavity"
+ ]
+
+ # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated
+ # NOTE(review): ", insulated (assumed) " ends with a space and already contains ", insulated", so it
+ # looks redundant for substring matching - confirm before removing.
+ EPC_INSULATED_ROOF_SUBSTRINGS = [
+ "(another dwelling above)", ", insulated", ", insulated (assumed) ",
+ ", ceiling insulated",
+ ]
+
+ # List of strings we look for in the EPC data, where substrings indicate that the cavity is empty
+ # NOTE(review): holds the same four strings as EPC_NO_WALL_INSULATION_DESCRIPTIONS, in a different order
+ UNINSULATED_CAVITY_SUBSTRINGS = [
+ "cavity wall, as built, no insulation (assumed)",
+ "cavity wall, as built, no insulation",
+ "cavity wall, as built, partial insulation (assumed)",
+ "cavity wall, as built, partial insulation",
+ ]
+
+ def __init__(
+         self,
+         local_filepath,
+         sheet_name,
+         address1_colname,
+         postcode_colname,
+         full_address_colname,
+         landlord_property_id=None,
+         full_address_cols_to_concat=None,
+         missing_postcodes_method=None,
+         address1_extraction_method=None,
+         landlord_year_built=None,
+         landlord_uprn=None,
+         landlord_property_type=None,
+         landlord_built_form=None,
+         landlord_wall_construction=None,
+         landlord_roof_construction=None,
+         landlord_heating_system=None,
+         landlord_existing_pv=None,
+         landlord_sap=None,
+         phase=False,
+         header=0
+ ):
+     """
+     Read the landlord's asset list and record which of its columns hold which piece of information.
+
+     :param local_filepath: Path to the asset list. Files ending in ".xlsx" are read as Excel, anything
+         else is read as CSV.
+     :param sheet_name: Sheet to read (Excel files only).
+     :param address1_colname: Column holding address line 1, or None if it needs to be extracted.
+     :param postcode_colname: Column holding the postcode.
+     :param full_address_colname: Column holding the full address, or None if it needs to be built.
+     :param landlord_property_id: Column holding the landlord's own property ID.
+     :param full_address_cols_to_concat: Columns to join into a full address when there is none.
+     :param missing_postcodes_method: Stored on the instance; not used during construction.
+     :param address1_extraction_method: Method used to derive address line 1 when there is none.
+     :param landlord_year_built: Column holding the year built.
+     :param landlord_uprn: Column holding the Ordnance Survey UPRN.
+     :param landlord_property_type: Column holding the property type.
+     :param landlord_built_form: Column holding the built form.
+     :param landlord_wall_construction: Column holding the wall construction.
+     :param landlord_roof_construction: Column holding the roof construction.
+     :param landlord_heating_system: Column holding the heating system.
+     :param landlord_existing_pv: Column holding whether PV is already installed.
+     :param landlord_sap: Column holding the SAP rating.
+     :param phase: True when the programme is being processed in phases.
+     :param header: Row to use for the column names when reading the file.
+     """
+     self.local_filepath = local_filepath
+     self.sheet_name = sheet_name
+     # Read in the data
+     if local_filepath.endswith(".xlsx"):
+         self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
+     else:
+         # The header row applies to CSV files as well
+         self.raw_asset_list = pd.read_csv(local_filepath, header=header)
+     self.standardised_asset_list = self.raw_asset_list.copy()
+     # Will be used to store aggregated figures against the various work types
+     self.work_type_figures = {}
+     self.flat_data = None
+     self.duplicated_addresses = None
+     self.contact_details = None
+     self.contact_detail_fields = None
+     self.outcomes = None
+     self.outcomes_no_match = pd.DataFrame()
+     self.outcomes_for_output = pd.DataFrame()
+     self.master_surveyed = None
+     self.unmatched_submissions = pd.DataFrame()
+
+     # When this is True, we intend to break the programme into multiple phases. We may need to review
+     # how this is structured in the future, as depending on how we get future data, we may need to
+     # remove some existing phases from the reporting, or specifically highlight the phase (1 to n-1)
+     # properties, assuming the current phase is n.
+     self.phase = phase
+
+     # We detect the presence of the non-intrusive columns
+     self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
+     # We detect if we have the old format of non-intrusives
+     self.old_format_non_intrusives_present = "WFT Findings" in self.raw_asset_list.columns
+
+     self.non_intrusives_eligibility = "Eligibility (Red/Yellow/Green)" in self.raw_asset_list.columns
+
+     # Names of columns
+     self.landlord_property_id = landlord_property_id
+     self.address1_colname = address1_colname
+     self.postcode_colname = postcode_colname
+     self.full_address_colname = full_address_colname
+     self.landlord_year_built = landlord_year_built
+     self.landlord_uprn = landlord_uprn
+     self.landlord_property_type = landlord_property_type
+     self.landlord_built_form = landlord_built_form
+     self.landlord_wall_construction = landlord_wall_construction
+     self.landlord_roof_construction = landlord_roof_construction
+     self.landlord_heating_system = landlord_heating_system
+     self.landlord_existing_pv = landlord_existing_pv
+     self.landlord_sap = landlord_sap
+
+     # parameters for cleaning
+     self.full_address_cols_to_concat = full_address_cols_to_concat
+     self.missing_postcodes_method = missing_postcodes_method
+     self.address1_extraction_method = address1_extraction_method
+
+     self.debug_information = {
+         "property_type": None,
+         "wall_construction": None,
+         "heating_system": None,
+         "existing_pv": None
+     }
+
+     self.variable_mappings = {}
+     self.hubspot_data = None
+
+     self.rename_map = {}
+     self.keep_variables = []
+
+     # The checks below handle one source column being used for two purposes, by copying it into a
+     # second, standard-named column. Two omitted (None) columns also compare equal, so each check
+     # requires a real column name - otherwise we would try to copy a column called None.
+
+     # Finally, we handle the case where the landlord's property ID is actually the OS UPRN
+     if (self.landlord_uprn == self.landlord_property_id) and (self.landlord_property_id is not None):
+         self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
+         # Update the reference to landlord UPRN
+         self.landlord_uprn = self.STANDARD_UPRN
+
+     # Handle the case when full address and address 1 are the same
+     if (self.full_address_colname == self.address1_colname) and (self.address1_colname is not None):
+         self.full_address_colname = self.STANDARD_FULL_ADDRESS
+         self.standardised_asset_list[self.full_address_colname] = (
+             self.standardised_asset_list[self.address1_colname].copy()
+         )
+
+     # Handle the case where the property type column is the same as the built form
+     if (self.landlord_property_type == self.landlord_built_form) and (self.landlord_property_type is not None):
+         self.landlord_built_form = self.STANDARD_BUILT_FORM
+         self.standardised_asset_list[self.landlord_built_form] = (
+             self.standardised_asset_list[self.landlord_property_type].copy()
+         )
+
+     # If landlord built form is None (which it often is) we use the built form from inspections
+     if (self.landlord_built_form is None) and self.non_intrusives_present:
+         self.landlord_built_form = self.STANDARD_BUILT_FORM
+         self.standardised_asset_list[self.landlord_built_form] = (
+             self.standardised_asset_list["Archetype"].copy()
+         )
+
+ def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
+     """
+     Derive address line 1 from the full address and store it in the self.address1_colname column.
+
+     :param asset_list: DataFrame to add the column to (modified in place and returned).
+     :param full_address_col: Name of the column holding the full address.
+     :param postcode_col: Name of the column holding the postcode (used by house number extraction only).
+     :param method: One of ADDRESS_1_CLEANING_METHODS.
+     :return: The asset list with the address line 1 column populated.
+     :raises ValueError: If the method is not one of ADDRESS_1_CLEANING_METHODS.
+     """
+
+     if method not in self.ADDRESS_1_CLEANING_METHODS:
+         raise ValueError(f"Method {method} for producing address1 not recognized")
+
+     if method == "first_two_words":
+         asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+         return asset_list
+
+     if method == "first_word":
+         asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
+         return asset_list
+
+     if method == "house_number_extraction":
+         # One extraction call per row; a second pass over the rows that threw its results away
+         # has been removed.
+         asset_list[self.address1_colname] = asset_list.apply(
+             lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+             axis=1
+         )
+         return asset_list
+
+     # Only reachable if a method is added to ADDRESS_1_CLEANING_METHODS without a branch above
+     raise ValueError(f"Method {method} not recognized")
+
+ @staticmethod
+ def _address1_extraction(x):
+     """Placeholder with no implementation yet; always returns None."""
+     return None
+
+ def create_property_id(self):
+     """
+     Create the domna property ID column from the full address and postcode.
+
+     The two columns are concatenated, stripped of punctuation and spaces and lower-cased; the ID is that
+     cleaned text followed by a hyphen and the first 12 hex characters of its SHA-256 digest.
+     :return: None - the ID is written to the DOMNA_PROPERTY_ID column of the standardised asset list
+     """
+
+     def _slug_with_digest(text):
+         """Returns the tidied text with a short, stable SHA-256 suffix appended."""
+         # Tidy the text once more so the readable part of the ID has no special characters
+         without_specials = re.sub(r"[^\w\s-]", "", text)
+         readable_part = without_specials.replace(" ", "_").lower()
+
+         # The digest is taken over the text exactly as received, then truncated
+         digest = hashlib.sha256(text.encode()).hexdigest()
+
+         return f"{readable_part}-{digest[:12]}"
+
+     frame = self.standardised_asset_list
+     combined = frame[self.full_address_colname] + frame[self.postcode_colname]
+
+     # Punctuation and spaces are removed before hashing
+     normalised = combined.str.strip()
+     normalised = normalised.str.replace(r"[^\w\s]", "", regex=True)
+     normalised = normalised.str.replace(" ", "")
+     normalised = normalised.str.lower()
+
+     self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = normalised.apply(_slug_with_digest)
+
+ @staticmethod
+ def _strip_postcode_from_full_address(full_address, postcode):
+     """
+     Remove the postcode from a full address and tidy what is left.
+
+     :param full_address: Address text that may contain the postcode.
+     :param postcode: Postcode to remove (every exact occurrence is removed).
+     :return: The address without the postcode and without leftover commas or spaces at the ends.
+     """
+     without_postcode = full_address.replace(postcode, "")
+     # First drop the separator left dangling at the end, then any comma at either end, then whitespace
+     without_trailing_separator = without_postcode.rstrip(", ")
+     without_edge_commas = without_trailing_separator.strip(",")
+     return without_edge_commas.strip()
+
+ @classmethod
+ def _identify_multi_address(cls, address):
+     """
+     Flag addresses whose first comma-separated section looks like a range of units (e.g. "1-3").
+
+     :param address: Full address text.
+     :return: True if the section before the first comma matches MULTI_UNIT_REGEX. False otherwise,
+         including for addresses with no comma, so the result is always a bool.
+     """
+     # Only comma separated addresses are checked
+     if "," not in address:
+         return False
+
+     address1_section = address.split(",")[0]
+     # We look for a string in the form (x-y)
+     return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
+
+ @staticmethod
+ def _convert_uprn(x):
+     """
+     Used to convert UPRNs to integer strings
+     :param x: uprn to convert
+     :return: the uprn as a string of its integer value when it is numeric or made up of digits only;
+         otherwise the value unchanged (this includes nulls)
+     """
+
+     if pd.isnull(x):
+         return x
+
+     # Numeric values and digit-only text both go through int() to drop decimals and leading zeros
+     if np.isreal(x) or str(x).isdigit():
+         return str(int(x))
+
+     return x
+
+ @staticmethod
+ def _clean_postcode(postcode):
+     """
+     Tidy the whitespace in a postcode and make sure it has a space before the last three characters.
+
+     :param postcode: Postcode text.
+     :return: The postcode with surrounding whitespace removed and runs of whitespace collapsed to one
+         space. If it then has no space and is longer than three characters, a space is inserted before
+         the last three characters.
+     """
+     # split() with no arguments splits on any run of whitespace and ignores it at the ends, so
+     # double spaces and stray padding are removed in one step
+     postcode = " ".join(postcode.split())
+
+     # Too short to hold both halves of a postcode, so there is nothing to restructure
+     if " " in postcode or len(postcode) <= 3:
+         return postcode
+
+     # Restructure it
+     return " ".join([postcode[:-3], postcode[-3:]])
+
+ def init_standardise(self):
+ """
+ Prepare the asset list for standardisation and build the category mappings for review.
+
+ Cleans the postcode and address columns, derives address 1 and/or the full address when they were not
+ supplied, creates the domna property ID, tidies the UPRN, wall construction and year built columns, and
+ records which columns to keep and how to rename them. It then builds one value mapping per categorical
+ column with DataRemapper, stores them in self.variable_mappings and prints them for review.
+
+ :return: None - everything is stored on the instance
+ :raises ValueError: if address 1 is missing and no extraction method was given, or if the full address is
+ missing and no columns to concatenate were given
+ :raises NotImplementedError: if a year built value is in a format that extract_year does not handle
+ """
+
+ # Remove rows without a postcode
+ if self.postcode_colname is not None:
+ self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
+ # We also clean postcode columns: if there is no space, we create one
+ self.standardised_asset_list[self.postcode_colname] = self.standardised_asset_list[
+ self.postcode_colname
+ ].apply(self._clean_postcode)
+
+ # We clean up potential non-breaking spaces, and double spaces
+ for col in [
+ c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
+ c is not None
+ ]:
+ self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
+ self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
+ # NOTE(review): as written this replaces a single space with a single space, which does nothing;
+ # presumably the first argument was meant to be two spaces - confirm
+ self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False)
+
+ if self.address1_colname is None:
+ if self.address1_extraction_method is None:
+ raise ValueError("Missing address 1 - please specify an extraction method")
+ self.address1_colname = self.STANDARD_ADDRESS_1
+ # If we do not have this, we produce it
+ self.standardised_asset_list = self._extract_address1(
+ asset_list=self.standardised_asset_list,
+ full_address_col=self.full_address_colname,
+ postcode_col=self.postcode_colname,
+ method=self.address1_extraction_method
+ )
+
+ if self.full_address_colname is None:
+ if not self.full_address_cols_to_concat:
+ raise ValueError("Missing full address - please specify columns to concatenate")
+ self.full_address_colname = self.STANDARD_FULL_ADDRESS
+ # Join the non-null parts of the address with ", "
+ self.standardised_asset_list[self.full_address_colname] = (
+ self.standardised_asset_list[self.full_address_cols_to_concat].apply(
+ lambda x: ", ".join([y for y in x if not pd.isnull(y)]),
+ axis=1
+ )
+ )
+ else:
+
+ # Make sure to strip the postcode out of the full address
+ self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
+ lambda x: self._strip_postcode_from_full_address(
+ full_address=x[self.full_address_colname],
+ postcode=x[self.postcode_colname]
+ ),
+ axis=1
+ )
+
+ # We create the domna property id
+ self.create_property_id()
+
+ # Clean up the UPRN column, if the landlord has provided them
+ if self.landlord_uprn is not None:
+ self.standardised_asset_list[self.landlord_uprn] = (
+ self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn)
+ )
+
+ # We keep just the columns we care about and will work through the various columns and standardise
+ variables = [
+ self.landlord_property_id,
+ self.DOMNA_PROPERTY_ID,
+ self.address1_colname,
+ self.postcode_colname,
+ self.full_address_colname,
+ self.landlord_uprn,
+ self.landlord_property_type,
+ self.landlord_built_form,
+ self.landlord_year_built,
+ self.landlord_wall_construction,
+ self.landlord_roof_construction,
+ self.landlord_heating_system,
+ self.landlord_existing_pv,
+ self.landlord_sap,
+ ]
+ # Keep just non-null variables (e.g. the landlord may not provide a uprn)
+ self.keep_variables = [v for v in variables if v is not None]
+ # Maps each source column name onto its standard name; columns that were not provided have a None
+ # key here and are dropped straight after
+ self.rename_map = {
+ self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
+ self.address1_colname: self.STANDARD_ADDRESS_1,
+ self.postcode_colname: self.STANDARD_POSTCODE,
+ self.full_address_colname: self.STANDARD_FULL_ADDRESS,
+ self.landlord_uprn: self.STANDARD_UPRN,
+ self.landlord_property_type: self.STANDARD_PROPERTY_TYPE,
+ self.landlord_built_form: self.STANDARD_BUILT_FORM,
+ self.landlord_year_built: self.STANDARD_YEAR_BUILT,
+ self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION,
+ self.landlord_roof_construction: self.STANDARD_ROOF_CONSTRUCTION,
+ self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
+ self.landlord_existing_pv: self.STANDARD_EXISTING_PV,
+ self.landlord_sap: self.STANDARD_SAP,
+ }
+ self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None}
+
+ non_intrusive_columns = []
+ if self.non_intrusives_present:
+ # NOTE(review): this binds the class-level list itself, not a copy, so the append below would add
+ # the eligibility column to NON_INTRUSIVES_COLNAMES for every later use - consider copying the list
+ non_intrusive_columns = self.NON_INTRUSIVES_COLNAMES
+
+ if self.non_intrusives_eligibility:
+ non_intrusive_columns.append(self.NON_INTRUSIVES_ELIGIBILITY_COLUMN)
+
+ if self.old_format_non_intrusives_present:
+ # We check if we have the ECO Eligibility column, which we might not have.
+ # This replaces whatever was collected above with the old-format columns that are present.
+ non_intrusive_columns = [
+ c for c in self.OLD_FORMAT_NON_INTRUSIVE_COLNAMES if c in self.standardised_asset_list.columns
+ ]
+
+ self.keep_variables += non_intrusive_columns
+
+ # Non-intrusive columns are renamed with a "non-intrusives: " prefix
+ self.rename_map = {
+ **self.rename_map,
+ **dict(
+ zip(non_intrusive_columns, ["non-intrusives: " + c for c in non_intrusive_columns])
+ )
+ }
+
+ # We identify addresses which are likely to be multi-addresses (e.g. are rooms x-y)
+ self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
+ self.full_address_colname
+ ].apply(lambda x: self._identify_multi_address(x))
+
+ # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and
+ # we see instances of "average thermal transmittance" in the description
+ if self.landlord_wall_construction is not None:
+ # The "== True" comparison means only rows where the text was actually found are replaced
+ self.standardised_asset_list[self.landlord_wall_construction] = np.where(
+ self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains(
+ "average thermal transmittance"
+ ) == True,
+ "new build - average thermal transmittance",
+ self.standardised_asset_list[self.landlord_wall_construction]
+ )
+ else:
+ # We want to make sure that we have a column for wall construction
+ self.landlord_wall_construction = self.STANDARD_WALL_CONSTRUCTION
+ self.standardised_asset_list[self.landlord_wall_construction] = None
+
+ # Likewise, make sure that we have a column for roof construction
+ if self.landlord_roof_construction is None:
+ self.landlord_roof_construction = self.STANDARD_ROOF_CONSTRUCTION
+ self.standardised_asset_list[self.landlord_roof_construction] = None
+
+ # Clean our build year column
+ # We attempt to process the year built column
+ if self.landlord_year_built is not None:
+ # We check if we have a datetime - year built has not been renamed.
+ # Only the first row is inspected to decide how the whole column is handled.
+ if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime):
+ # We treat any string values - with common values we see
+ self.standardised_asset_list[self.landlord_year_built] = (
+ self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP)
+ )
+
+ self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime(
+ self.standardised_asset_list[self.landlord_year_built]
+ )
+ # Convert this to year
+ self.standardised_asset_list[self.landlord_year_built] = (
+ self.standardised_asset_list[self.landlord_year_built].dt.year
+ )
+ else:
+ # We attempt to convert the year built to a year, by detecting the format and converting
+
+ def extract_year(date_str):
+ """
+ Extracts the year from a year built value.
+
+ Handles date strings in the format '01-Jul-YYYY', hyphenated year ranges, datetimes, four digit
+ years held as floats or text, and text containing exactly four digits.
+ Returns the year as an integer, or None for null values, 0 and known error values.
+ Raises NotImplementedError for any other format.
+ """
+ # Values seen in landlord data that do not represent a year
+ known_errors = [
+ "#MULTIVALUE",
+ "This cell has an external reference that can't be shown or edited. Editing this cell will "
+ "remove the external reference.",
+ "ND",
+ 'PIMSS EMPTY'
+ ]
+
+ if pd.isnull(date_str) or date_str in known_errors or (date_str == 0):
+ return None
+
+ if isinstance(date_str, str):
+ match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str)
+ if match:
+ return int(match.group(1)) # Extract the year and convert to integer
+ if "-" in date_str:
+
+ # Count the number of times we have "-", as we've seen double ranges
+ # (when we have extensions) so the format is like this:
+ # 'G: 1983-1990, H: 1991-1995'
+ if date_str.count("-") == 2:
+ # We have a double range: take the end year of the first range
+ return int(date_str.split("-")[1].split(",")[0])
+ # We probably have a single range: take the end year
+ return int(date_str.split("-")[1].strip())
+
+ if isinstance(date_str, datetime):
+ return date_str.year
+
+ if isinstance(date_str, float):
+ if str(int(date_str)).isdigit() & (len(str(int(date_str))) == 4):
+ return int(date_str)
+
+ # Check if date_str is a year itself
+ if str(date_str).isdigit() & (len(str(date_str)) == 4):
+ return int(date_str)
+
+ # Remove any non-numeric characters
+ date_str = re.sub(r"\D", "", str(date_str))
+ if str(date_str).isdigit() & (len(str(date_str)) == 4):
+ return int(date_str)
+
+ raise NotImplementedError(f"Unhandled format for year built, value is {date_str} - implement me")
+
+ self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[
+ self.landlord_year_built
+ ].apply(extract_year)
+
+ # We now create standard lookups: for each categorical column, the allowed standard values and the
+ # predefined mapping used to standardise it
+ to_remap = {
+ self.landlord_property_type: {
+ "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES,
+ "standard_map": property_type_mappings.PROPERTY_MAPPING
+ },
+ self.landlord_built_form: {
+ "standard_values": built_form_mappings.STANDARD_BUILT_FORMS,
+ "standard_map": built_form_mappings.BUILT_FORM_MAPPINGS
+ },
+ self.landlord_wall_construction: {
+ "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS,
+ "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS
+ },
+ self.landlord_heating_system: {
+ "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS,
+ "standard_map": heating_mappings.HEATING_MAPPINGS
+ },
+ self.landlord_existing_pv: {
+ "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV,
+ "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
+ },
+ self.landlord_roof_construction: {
+ "standard_values": roof_mappings.STANDARD_ROOF_CONSTRUCTIONS,
+ "standard_map": roof_mappings.ROOF_CONSTRUCTION_MAPPINGS
+ }
+ }
+ # Keep just entries where the key is not None
+ to_remap = {k: v for k, v in to_remap.items() if k is not None}
+
+ for variable, config in to_remap.items():
+ logger.info("Standardising variable: %s", variable)
+ # Strip each of these columns
+ self.standardised_asset_list[variable] = self.standardised_asset_list[variable].str.strip()
+ values_to_remap = self.standardised_asset_list[variable].unique()
+ # We want to map this to our standardised list of values we're interested in.
+ # A new DataRemapper is created for every column; values it cannot match itself are sent to OpenAI.
+ remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"])
+ remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist())
+ self.variable_mappings[variable] = remap_dictionary
+
+ # We now print out the variable mappings, which can be reviewed by the user, before the final standardised
+ # asset list is returned
+ for variable, mapping in self.variable_mappings.items():
+ pprint(f"Variable: {variable}")
+ pprint(mapping)
+ # Print a space
+ print("\n")
+ pprint("=======================================")
+
+    def apply_standardiation(self, override_empty_mappings=False):
+        """
+        Apply the mappings built by init_standardise to the asset list.
+
+        Steps, in order:
+        1) (only when self.phase is set) drop rows whose 'Surveyors Name' is "YET TO BE SURVEYED"
+        2) copy each mapped column into "<column>_original_from_landlord" and map the column itself
+           through its entry in self.variable_mappings
+        3) drop rows whose domna property id has already been seen, recording them in
+           self.duplicated_addresses
+        4) select self.keep_variables, rename using self.rename_map and create any standard column
+           that is still absent (filled with None)
+        5) cast the landlord property id column to string
+
+        :param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant
+            if there are no categories which need remapping which is highly unlikely
+        :raises ValueError: if there are no variable mappings and override_empty_mappings is False
+        :return: None - self.standardised_asset_list is replaced with the standardised frame
+        """
+        # Validate first, so that a failed call leaves the asset list untouched
+        if not self.variable_mappings and not override_empty_mappings:
+            raise ValueError("Please run init_standardise first")
+
+        if self.phase:
+            # We filter on just the properties that have had an inspection. The copy means the column
+            # assignments below write to an independent frame rather than a slice of the unfiltered one
+            self.standardised_asset_list = self.standardised_asset_list[
+                ~self.standardised_asset_list['Surveyors Name'].isin(["YET TO BE SURVEYED"])
+            ].copy()
+
+        logger.info("Applying standardisation to asset list")
+
+        for variable, mapping in self.variable_mappings.items():
+            # Keep the landlord's raw value next to the standardised one so the mapping can be audited
+            self.standardised_asset_list[variable + "_original_from_landlord"] = (
+                self.standardised_asset_list[variable].copy()
+            )
+            self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
+
+        # duplicated() marks every occurrence after the first, so the first row for each id is kept
+        is_duplicate = self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
+        if is_duplicate.any():
+            pprint(f"There are {is_duplicate.sum()} duplicated addresses - dropping")
+
+            # Keep a record of duplicates
+            self.duplicated_addresses = self.standardised_asset_list.loc[
+                is_duplicate, [self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]
+            ].copy()
+
+            self.standardised_asset_list = self.standardised_asset_list[~is_duplicate]
+
+        # We add the original columns to the keep variables. Names already present are skipped so that a
+        # repeated call cannot select the same column twice, and a new list is bound instead of extending
+        # the existing one in place
+        original_columns = [k + "_original_from_landlord" for k in self.variable_mappings]
+        self.keep_variables = self.keep_variables + [
+            c for c in original_columns if c not in self.keep_variables
+        ]
+
+        # Perform final variable selection and apply renames to our standard names
+        self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename(
+            columns=self.rename_map
+        )
+
+        # We fill any standard columns that are not in the data because they were not provided by the landlord
+        expected_columns = [
+            self.STANDARD_EXISTING_PV,
+            self.STANDARD_HEATING_SYSTEM,
+            self.STANDARD_UPRN,
+            self.STANDARD_PROPERTY_TYPE,
+            self.STANDARD_YEAR_BUILT,
+            self.STANDARD_WALL_CONSTRUCTION,
+        ]
+        for column in expected_columns:
+            if column not in self.standardised_asset_list.columns:
+                self.standardised_asset_list[column] = None
+
+        # Convert to string
+        self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID] = (
+            self.standardised_asset_list[self.STANDARD_LANDLORD_PROPERTY_ID].astype(str)
+        )
+
+    def merge_data(self, df: pd.DataFrame):
+        """
+        Left-join extra columns onto the standardised asset list, keyed on the domna property id.
+
+        :param df: Data to attach; it must contain the domna property id column, with no repeated ids
+        :raises ValueError: if the id column is absent from df, or if it holds duplicated ids
+        :return: None - self.standardised_asset_list is replaced with the merged frame
+        """
+        key = self.DOMNA_PROPERTY_ID
+
+        if key not in df.columns:
+            raise ValueError(f"Dataframe must contain the column {key}")
+
+        # A repeated id on the incoming side would multiply rows in the asset list, so we refuse it
+        has_repeated_ids = df[key].duplicated().any()
+        if has_repeated_ids:
+            raise ValueError(f"{key} contains duplicated IDs")
+
+        merged = self.standardised_asset_list.merge(df, how="left", on=key)
+        self.standardised_asset_list = merged
+
+    def extract_attributes(self, pull_epc=True):
+        """
+        Derive the attributes that we typically use to identify viable work and write them onto the
+        standardised asset list.
+
+        Columns written (named by the ATTRIBUTE_* constants): has-solar flag, number of floors, estimated
+        perimeter, heat loss area, EPC roof insulation thickness, SAP-threshold flag and EPC-year flag.
+        The EPC total floor area and number of habitable rooms are cast to float in place. The method
+        finishes by calling process_age_band.
+
+        :param pull_epc: Not read by this method; kept so that existing callers continue to work
+        :return: None
+        """
+        assets = self.standardised_asset_list
+        epc = self.EPC_API_DATA_NAMES
+
+        # Solar is flagged when the "Solar photovoltaics" column is set, or when the EPC photo-supply holds
+        # anything other than a zero / empty / missing value
+        no_photo_supply = assets[epc["photo-supply"]].isin(["0.0", 0, None, "", np.nan])
+        assets[self.ATTRIBUTE_HAS_SOLAR] = (
+            assets[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] | ~no_photo_supply
+        )
+
+        accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"]
+
+        def resolve_property_type(row):
+            # The logic here is:
+            # 1) Take the property type provided by the HA themselves
+            # 2) In absence of that, take the EPC property type
+            # 3) Otherwise use None
+            landlord_type = str(row[self.STANDARD_PROPERTY_TYPE]).title()
+            if landlord_type in accepted_epc_property_types:
+                return landlord_type
+            epc_type = row[epc["property-type"]]
+            return None if pd.isnull(epc_type) else epc_type
+
+        assets[self.ATTRIBUTE_NUMBER_OF_FLOORS] = assets.apply(
+            lambda x: estimate_number_of_floors(property_type=resolve_property_type(x)),
+            axis=1
+        )
+
+        # Empty strings become NaN before the float cast. np.nan is used rather than None because
+        # Series.replace(to_replace, None) has meant "forward fill" in some pandas releases
+        for field in ("total-floor-area", "number-habitable-rooms"):
+            assets[epc[field]] = assets[epc[field]].replace("", np.nan).astype(float)
+
+        # Estimate the perimeter of a single storey: floor area and room count are split evenly per floor
+        assets[self.ATTRIBUTE_ESTIMATED_PERIMETER] = assets.apply(
+            lambda x: estimate_perimeter(
+                floor_area=x[epc["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+                num_rooms=x[epc["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+            ), axis=1
+        )
+
+        def resolve_floor_height(row):
+            # Fall back to 2.5 when the EPC floor height is missing. NaN is checked explicitly because it is
+            # truthy and would otherwise be passed through as the floor height
+            raw_height = row[epc["floor-height"]]
+            if pd.isnull(raw_height) or not raw_height:
+                return 2.5
+            return float(raw_height)
+
+        assets[self.ATTRIBUTE_HEAT_LOSS_AREA] = assets.apply(
+            lambda x: estimate_external_wall_area(
+                num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+                floor_height=resolve_floor_height(x),
+                perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER],
+                built_form=x[epc["built-form"]]
+            ),
+            axis=1
+        )
+
+        # Pull the insulation thickness out of the EPC roof description, where there is one
+        assets[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = assets.apply(
+            lambda x: (
+                None if pd.isnull(x[epc["roof-description"]])
+                else RoofAttributes(description=x[epc["roof-description"]]).process()["insulation_thickness"]
+            ),
+            axis=1
+        )
+        # Strip the literal "+" character; regex=False makes sure "+" is never read as a regex quantifier,
+        # whatever the installed pandas default is
+        assets[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = (
+            assets[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].str.replace("+", "", regex=False)
+        )
+
+        # We produce some additional fields
+        # 1) Is the SAP rating at or below self.FILLED_CAVITY_SAP_THRESHOLD
+        assets[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
+            assets[epc["current-energy-efficiency"]].astype(float) <= self.FILLED_CAVITY_SAP_THRESHOLD
+        )
+        # 2) Flag anything where the EPC inspection year is before self.EPC_YEAR_THRESHOLD
+        assets[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
+            pd.to_datetime(assets[epc["inspection-date"]]).dt.year < self.EPC_YEAR_THRESHOLD
+        )
+
+        self.process_age_band()
+
+    def process_age_band(self):
+        """
+        Parse the EPC construction age band of every property and compare it with the landlord's year built.
+
+        Three columns are merged onto self.standardised_asset_list, keyed on the domna property id:
+        - epc_year_lower_bound / epc_year_upper_bound: bounds of the EPC age band (None where open-ended)
+        - does_age_band_match_epc_age_band: text describing how the landlord's year built compares
+
+        Label convention, as used for ranged bands: "EPC Age Band is older than Year Built" means the band
+        ends before the year built; "EPC Age Band is newer than Year Built" means the band starts after it.
+        A year built that is null or falsy (0, "") is reported as "No Year Built From Landlord".
+
+        :return: None
+        """
+        age_band_column = self.EPC_API_DATA_NAMES["construction-age-band"]
+        id_column = self.DOMNA_PROPERTY_ID
+
+        no_year_built = "No Year Built From Landlord"
+        band_matches = "EPC Age Band Matches Year Built"
+        band_older = "EPC Age Band is older than Year Built"
+        band_newer = "EPC Age Band is newer than Year Built"
+
+        # Open-ended bands that only carry a lower bound
+        onwards_bands = {
+            "England and Wales: 2007 onwards": 2007,
+            "England and Wales: 2012 onwards": 2012,
+        }
+
+        def classify(age_band, year_built):
+            # Returns (lower bound, upper bound, comparison label) for a single property
+            if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES:
+                return None, None, "No EPC Age Band"
+
+            # isnull is tested first: NaN is truthy, so a plain truthiness check would treat a missing
+            # year as present and label it as a mismatch
+            year_missing = pd.isnull(year_built) or not year_built
+
+            if age_band in onwards_bands:
+                lower = onwards_bands[age_band]
+                if year_missing:
+                    label = no_year_built
+                elif year_built >= lower:
+                    label = band_matches
+                else:
+                    # Built before the band starts, so the band is the newer of the two
+                    label = band_newer
+                return lower, None, label
+
+            if age_band == "England and Wales: before 1900":
+                if year_missing:
+                    label = no_year_built
+                elif year_built < 1900:
+                    label = band_matches
+                else:
+                    # Built in 1900 or later, so the band is the older of the two
+                    label = band_older
+                return None, 1899, label
+
+            if age_band.isdigit():
+                # The band is a single year rather than a range
+                band_year = int(age_band)
+                if year_missing:
+                    label = no_year_built
+                elif year_built == band_year:
+                    label = band_matches
+                else:
+                    label = "EPC Age Band is different from Year Built"
+                return band_year, band_year, label
+
+            # Otherwise, we extract the upper and lower bounds from "<region>: <lower>-<upper>"
+            lower_text, upper_text = age_band.split(": ")[1].split("-")
+            lower, upper = int(lower_text), int(upper_text)
+
+            if year_missing:
+                label = no_year_built
+            elif lower <= year_built <= upper:
+                label = band_matches
+            elif year_built > upper:
+                label = band_older
+            else:
+                label = band_newer
+            return lower, upper, label
+
+        derived_columns = ["epc_year_lower_bound", "epc_year_upper_bound", "does_age_band_match_epc_age_band"]
+
+        records = []
+        for _, row in self.standardised_asset_list.iterrows():
+            lower_bound, upper_bound, label = classify(row[age_band_column], row[self.STANDARD_YEAR_BUILT])
+            records.append(
+                {
+                    id_column: row[id_column],
+                    "epc_year_lower_bound": lower_bound,
+                    "epc_year_upper_bound": upper_bound,
+                    "does_age_band_match_epc_age_band": label
+                }
+            )
+
+        # Naming the columns means the merge key exists even when there are no records
+        processed_age_band = pd.DataFrame(records, columns=[id_column] + derived_columns)
+
+        # Drop results left by an earlier run, then merge on the property id only. Without an explicit key
+        # the merge would join on every column the two frames happen to share
+        stale_columns = [c for c in derived_columns if c in self.standardised_asset_list.columns]
+        self.standardised_asset_list = self.standardised_asset_list.drop(columns=stale_columns).merge(
+            processed_age_band, how="left", on=id_column
+        )
+
+ def identify_worktypes(self, cleaned):
+
+ if self.STANDARD_SAP is not None:
+ # We add a SAP category for all work type identification
+ self.standardised_asset_list["SAP Category"] = np.where(
+ (
+ (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68) |
+ (self.standardised_asset_list[self.STANDARD_SAP] <= 68)
+ ),
+ "SAP Rating 68 or less",
+ np.where(
+ (
+ (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
+ self.EMPTY_CAVITY_SAP_THRESHOLD
+ ) | (self.standardised_asset_list[self.STANDARD_SAP] <= self.EMPTY_CAVITY_SAP_THRESHOLD)
+ ),
+ f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}",
+ f"SAP Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more"
+ )
+ )
+ else:
+ # We add a SAP category for all work type identification
+ self.standardised_asset_list["SAP Category"] = np.where(
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 68,
+ "SAP Rating 68 or less",
+ np.where(
+ (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <=
+ self.EMPTY_CAVITY_SAP_THRESHOLD
+ ),
+ f"SAP Rating 69-{self.EMPTY_CAVITY_SAP_THRESHOLD}",
+ f"SAP Rating {self.EMPTY_CAVITY_SAP_THRESHOLD + 1} or more"
+ )
+ )
+
+ # Before we begin, we identify if a property has solar already as we use this
+ # for identifying cavity jobs
+ if self.non_intrusives_present:
+ existing_solar_non_intrusives_check = (
+ self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF"
+ )
+ elif self.old_format_non_intrusives_present:
+ existing_solar_non_intrusives_check = (
+ self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin(
+ ["solar pv on roof"]
+ )
+ )
+ else:
+ # We don't have an indication
+ existing_solar_non_intrusives_check = False
+
+ self.standardised_asset_list["property_has_solar"] = (
+ (self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") |
+ existing_solar_non_intrusives_check |
+ (self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR])
+ )
+
+ # If we have non-intrusives completed, we can use this to identify work types
+ ######################################################
+ # Empty cavity:
+ ######################################################
+ # 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled
+ # 2) The age is before 1995
+ # 3) We don't remove anything that has access issues yet
+
+ if self.non_intrusives_present:
+ non_intrusives_wall_filter = (
+ (self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
+ self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"])
+ )
+ elif self.old_format_non_intrusives_present:
+ non_intrusives_wall_filter = (
+ self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin(
+ ["empty cavity", "partial fill"]
+ ) | (
+ (
+ self.standardised_asset_list['non-intrusives: WFT Findings']
+ .str.lower().str.strip().str.contains("empty cavity|partial fill") &
+ ~self.standardised_asset_list['non-intrusives: WFT Findings']
+ .astype(str).str.lower().str.strip().str.contains("major access issues")
+ )
+ )
+ )
+ else:
+ # We set the filter to False, as we have no non-intrusives
+ non_intrusives_wall_filter = False
+
+ if self.landlord_year_built is None:
+ year_built_filter = self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD
+ else:
+ year_built_filter = (
+ (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) |
+ (self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD)
+ )
+
+ # Criteria:
+ # The property isn't a bedsit
+ # Non-intrusives indicate it needs a fill
+ # The EPC year is before 2002
+ # We also flag where the property has solar on the roof, because this is a signal of a high EPC rating
+ self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = (
+ (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+ non_intrusives_wall_filter &
+ year_built_filter &
+ (
+ ~self.standardised_asset_list["property_has_solar"]
+ )
+ )
+
+ self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"] = (
+ pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) &
+ (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+ non_intrusives_wall_filter &
+ year_built_filter &
+ (
+ # If the property has solar, there's a chance it won't qualify
+ self.standardised_asset_list["property_has_solar"]
+ )
+ )
+
+ # We also add a filter on anything that was generally identified by the non-intrusives
+ self.standardised_asset_list["non_intrusive_indicates_empty_cavity_no_year_filter"] = (
+ pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity"]) &
+ pd.isnull(self.standardised_asset_list["non_intrusive_indicates_empty_cavity_has_solar"]) &
+ (~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+ non_intrusives_wall_filter
+ )
+
+ self.standardised_asset_list["epc_indicates_empty_cavity"] = (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin(
+ self.EPC_NO_WALL_INSULATION_DESCRIPTIONS
+ ) & (
+ self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD
+ ) & (
+ ~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD]
+ ) & (
+ ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])
+ )
+ )
+
+ self.standardised_asset_list["landlord_data_indicates_empty_cavity"] = (
+ self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"]) &
+ (
+ (self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= self.EMPTY_CAVITY_YEAR_THRESHOLD) |
+ (self.standardised_asset_list["epc_year_upper_bound"] <= self.EMPTY_CAVITY_YEAR_THRESHOLD)
+ ) & (
+ ~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])
+ )
+ )
+
+ # Finally, we create a flag to indicate that the cavity is empty, based on the criteria above
+ self.standardised_asset_list["cavity_is_empty"] = (
+ non_intrusives_wall_filter |
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin(
+ self.EPC_NO_WALL_INSULATION_DESCRIPTIONS
+ ) |
+ self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(["uninsulated cavity"])
+ )
+
+ ######################################################
+ # Extraction
+ ######################################################
+ # as needing a CIGA check. What is the logic we should be applying here?
+
+ if self.non_intrusives_present:
+
+ extraction_wall_filter = (
+ (self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
+ (self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
+ (~self.standardised_asset_list['non-intrusives: Material'].isin(
+ ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"]
+ ))
+ )
+
+ if self.non_intrusives_eligibility:
+ # If we have the eligibility column, we check if the wall is eligible
+ extraction_wall_filter = (
+ extraction_wall_filter &
+ ~self.standardised_asset_list["non-intrusives: Eligibility (Red/Yellow/Green)"].isin(
+ ["RED"]
+ )
+ )
+
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
+ extraction_wall_filter & year_built_filter
+ )
+
+ elif self.old_format_non_intrusives_present:
+ print("Review these categories!!!!")
+ extraction_wall_filter = (
+ self.standardised_asset_list['non-intrusives: WFT Findings'].str.lower().str.strip().isin(
+ ["retro drilled", "retro filled", "fibre from build", "polybead", "retro drilled and filled",
+ "retro drilled & filled", "blown in white wool", "blown in yellow wool"]
+ )
+ )
+
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
+ extraction_wall_filter
+ )
+
+ else:
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = False
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = False
+
+ ######################################################
+ # Solar
+ ######################################################
+ # Criteria:
+ # Check 1: Does the property have a valid heating system?
+ self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = (
+ self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
+ [
+ "air source heat pump",
+ "ground source heat pump",
+ "high heat retention storage heaters",
+ "electric boiler"
+ ]
+ )
+ )
+ self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] = (
+ self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
+ ["electric storage heaters", "room heaters", "electric radiators", "no heating"]
+ )
+ )
+
+ self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = (
+ (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]]
+ .str.lower().str.contains("air source heat pump|ground source heat pump|boiler and radiators, electric")
+ ) | (
+ self.standardised_asset_list[
+ self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains(
+ "electric storage heaters"
+ ) & (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES[
+ "mainheatcont-description"]] == "Controls for high heat retention storage heaters"
+ )
+ )
+ )
+
+ self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"] = (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains(
+ "electric storage heaters|room heaters"
+ ) & (
+ self.standardised_asset_list[
+ self.EPC_API_DATA_NAMES["mainheatcont-description"]
+ ] != "Controls for high heat retention storage heaters"
+ )
+ )
+
+ # Basic check - both of the previous two shouldn't be true simultaneously
+ if (
+ self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] &
+ self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"]
+ ).sum():
+ raise ValueError("Both heating system checks are true - this should not be possible")
+
+ # Check 3: Does the property meet the fabric condition
+ # Solar PV installs are subject to the minimum insulation requirements which means:
+ # 1) one of the following insulation measures must be installed as part of the same
+ # ECO4 project:
+ # • roof insulation (flat roof, pitched roof, room-in-roof)
+ # • exterior facing wall insulation (cavity wall, solid wall)
+ # • party cavity wall insulation
+ # • floor insulation (solid and underfloor)
+ #
+ # OR
+ #
+ # all measures (except any exempted measure referred to in paragraph 4.28)
+ # listed in paragraph a) must already be installed
+ #
+ # With this in mind, we look for 2 classes
+ # 1) The property is fully insulated apart from the loft (<200mm insulation)
+ # 2) The property is fully insulated
+
+ print("Should we include cavity properties where they might be uninsulated?")
+ self.standardised_asset_list["solar_landlord_walls_insulated"] = (
+ self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(
+ [
+ "filled cavity", "insulated solid brick", "insulated timber frame",
+ ]
+ )
+ )
+
+ if self.non_intrusives_present:
+ self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = (
+ self.standardised_asset_list["non-intrusives: Insulated"].isin(
+ ["EWI", "RETRO DRILLED", "FILLED AT BUILD"]
+ )
+ )
+ elif self.old_format_non_intrusives_present:
+ self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = (
+ self.standardised_asset_list["non-intrusives: WFT Findings"].str.lower().str.strip().isin(
+ ["retro drilled", "retro filled", "ewi", "retro drilled/ solid"]
+ )
+ )
+ else:
+ self.standardised_asset_list["solar_non_intrusives_walls_insulated"] = False
+
+ # We merge on the u-value for average thermal transmittance
+ walls_uvalue_data = pd.DataFrame(cleaned["walls-description"])
+ walls_uvalue_data = walls_uvalue_data[
+ ~pd.isnull(walls_uvalue_data["thermal_transmittance"])
+ ][["original_description", "thermal_transmittance"]].rename(
+ columns={
+ "original_description": self.EPC_API_DATA_NAMES["walls-description"],
+ "thermal_transmittance": "walls_u_value"
+ }
+ )
+ self.standardised_asset_list = self.standardised_asset_list.merge(
+ walls_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["walls-description"]
+ )
+
+ self.standardised_asset_list["solar_epc_walls_insulated"] = (
+ (
+ self.standardised_asset_list[
+ self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains(
+ "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)
+ )
+ ) | (
+ self.standardised_asset_list["walls_u_value"].apply(lambda x: x <= 0.7 if not pd.isnull(x) else False)
+ )
+ )
+
+ # We merge on the u-value for average thermal transmittance
+ roof_roof_data = pd.DataFrame(cleaned["roof-description"])[
+ ["original_description", "thermal_transmittance", "is_pitched", "is_loft"]
+ ].rename(
+ columns={
+ "original_description": self.EPC_API_DATA_NAMES["roof-description"],
+ "thermal_transmittance": "roof_u_value",
+ }
+ )
+
+ self.standardised_asset_list = self.standardised_asset_list.merge(
+ roof_roof_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"]
+ )
+
+ # If the u-value of a roof is less than 0.7 we consider it insulated
+ self.standardised_asset_list["solar_epc_roof_insulated"] = (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains(
+ "|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS),
+ ) | (
+ self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
+ lambda x: int(x) >= 200 if str(x).isdigit() else False
+ )
+ ) | (
+ self.standardised_asset_list["roof_u_value"].apply(
+ lambda x: x <= 0.7 if not pd.isnull(x) else False
+ )
+ )
+ )
+
+ self.standardised_asset_list["solar_epc_loft_needs_topup"] = (
+ self.standardised_asset_list[
+ self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
+ lambda x: int(x) < 200 if str(x).isdigit() else False
+ ) | (
+ (
+ self.standardised_asset_list["is_loft"] | self.standardised_asset_list["is_pitched"]
+ ) & (
+ self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].isin(
+ ["below average", "none"]
+ )
+ )
+ )
+ )
+
+ self.standardised_asset_list["epc_has_floor_recommendation"] = (
+ self.standardised_asset_list["epc_has_floor_recommendation"].fillna(False)
+ )
+
+ # Check if the boiler is electric
+ # We check if it contains both the terms boiler & electric
+ self.standardised_asset_list["has_electric_boiler"] = (
+ (
+ self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]]
+ .str.lower().isin(
+ ["boiler and radiators, electric"])
+ ) | (
+ self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM] == "electric boiler"
+ )
+ )
+
+ ####################################
+ # Check solar eligibility
+ ####################################
+
+ # Set up the filters to stop repetition
+ correct_heating_system = (
+ self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] |
+ self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] |
+ self.standardised_asset_list["has_electric_boiler"]
+ )
+
+ needs_heating_upgrade = (
+ self.standardised_asset_list["solar_landlord_data_indicates_needs_heating_upgrade"] |
+ self.standardised_asset_list["solar_epc_data_indicates_requires_heating_upgrade"]
+ )
+
+ # The requirements for walls are:
+ # 1) walls are insulated
+ # 2) property is a cavity (can be done insulated or not)
+
+ walls_meet_solar_requirements = (
+ # The landlord is saying the walls are insulated
+ self.standardised_asset_list["solar_landlord_walls_insulated"] |
+ # EPC data is saying the walls are insulated
+ self.standardised_asset_list["solar_epc_walls_insulated"] |
+ # Non-intrusives are saying the walls are insulated
+ self.standardised_asset_list["solar_non_intrusives_walls_insulated"] |
+ # It's empty cavity
+ self.standardised_asset_list["cavity_is_empty"] |
+ # It's a cavity wall
+ (self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].str.contains("cavity"))
+ )
+
+ not_a_flat = (
+ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "flat"
+ )
+
+ solar_roof_meets_criteria = (
+ self.standardised_asset_list["solar_epc_roof_insulated"] |
+ self.standardised_asset_list["solar_epc_loft_needs_topup"]
+ )
+
+ self.standardised_asset_list["solar_eligible"] = (
+ # Property isn't a flat
+ not_a_flat &
+ # Landlord data or EPC data indicates the heating system is appropriate
+ correct_heating_system &
+ # The property doesn't currently have solar
+ ~self.standardised_asset_list["property_has_solar"] &
+ # The walls are insulated
+ walls_meet_solar_requirements &
+ # Roof meets criteria
+ solar_roof_meets_criteria
+ )
+
+ # With heating upgrade
+ self.standardised_asset_list["solar_eligible_needs_heating_upgrade"] = (
+ not_a_flat &
+ # Needs heating upgrade
+ needs_heating_upgrade &
+ # The property doesn't currently have solar
+ ~self.standardised_asset_list["property_has_solar"] &
+ # The walls are insulated
+ walls_meet_solar_requirements &
+ # Roof meets criteria
+ solar_roof_meets_criteria
+ )
+
+ # We shouldn't have an overlap
+ if (
+ self.standardised_asset_list["solar_eligible"] &
+ self.standardised_asset_list["solar_eligible_needs_heating_upgrade"]
+ ).sum():
+ raise ValueError("Both heating upgrade and no heating upgrade are true - this should not be possible")
+
+ # We check for a specific sub-set of properties which are uninsulated solid wall properties that are EPC E
+ # or below (we'll use 57 as a threshold) - These are for a pilot with Net Zero Renewables
+ self.standardised_asset_list["solar_eligible_solid_wall_uninsulated"] = (
+ not_a_flat &
+ # Landlord data or EPC data indicates the heating system is appropriate - in this case, we can also take
+ # electric boilers
+ correct_heating_system &
+ # The property doesn't currently have solar
+ ~self.standardised_asset_list["property_has_solar"] &
+ # The walls are uninsulated solid
+ ~walls_meet_solar_requirements &
+ (self.standardised_asset_list[self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= 57)
+ )
+
+ # Drop anything we don't need
+ self.standardised_asset_list = self.standardised_asset_list.drop(
+ columns=["walls_u_value", "roof_u_value"]
+ )
+
+ # Adjust flagged extraction jobs to remove anything for solar
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] &
+ ~self.standardised_asset_list["solar_eligible"]
+ )
+
+ # Finally, we note why each property has been flagged
+ self.standardised_asset_list["cavity_reason"] = None
+
+ empty_cavity_map = {
+ "non_intrusive_indicates_empty_cavity": "Non-Intrusive Data Shows Empty Cavity: ",
+ "non_intrusive_indicates_empty_cavity_has_solar": "Non-Intrusive Data Shows Empty Cavity - property "
+ "already has solar: ",
+ "non_intrusive_indicates_empty_cavity_no_year_filter": f"Non-Intrusive Data Shows Empty Cavity, "
+ f"built after {self.EMPTY_CAVITY_YEAR_THRESHOLD}: ",
+
+ }
+ for variable, description in empty_cavity_map.items():
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ self.standardised_asset_list[variable] &
+ pd.isnull(self.standardised_asset_list["cavity_reason"]),
+ description + self.standardised_asset_list["SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ # We break the cavity reason into a few different categories, when the EPC is different from inspections
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ self.standardised_asset_list["epc_indicates_empty_cavity"] &
+ ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+ (self.standardised_asset_list['non-intrusives: Insulated'] == "RETRO DRILLED") &
+ pd.isnull(self.standardised_asset_list["cavity_reason"])
+ ),
+ "EPC Shows Empty Cavity, inspections show retro drilled: " + self.standardised_asset_list["SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ self.standardised_asset_list["epc_indicates_empty_cavity"] &
+ ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+ (self.standardised_asset_list['non-intrusives: Insulated'] == "FILLED AT BUILD") &
+ pd.isnull(self.standardised_asset_list["cavity_reason"])
+ ),
+ "EPC Shows Empty Cavity, inspections show filled at build: " + self.standardised_asset_list["SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ self.standardised_asset_list["epc_indicates_empty_cavity"] &
+ ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+ pd.isnull(self.standardised_asset_list["cavity_reason"])
+ ),
+ "EPC Shows Empty Cavity, inspections show non-cavity build: " + self.standardised_asset_list[
+ "SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+ # Landlord data: The landlord's data indicates that the wall is an uninsulated cavity wall, but EPC and
+ # inspections show filled
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ self.standardised_asset_list["landlord_data_indicates_empty_cavity"] &
+ ~self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] &
+ ~self.standardised_asset_list["epc_indicates_empty_cavity"] &
+ pd.isnull(self.standardised_asset_list["cavity_reason"])
+ ),
+ "Landlord Data Shows Empty Cavity, EPC & Inspections Shows Filled: " + self.standardised_asset_list[
+ "SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ # Flag extraction
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] &
+ pd.isnull(self.standardised_asset_list["cavity_reason"])
+ ),
+ "Non-Intrusive Data Shows Cavity Extraction: " + self.standardised_asset_list["SAP Category"],
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ ######################################################
+ # Flag solar
+ ######################################################
+ self.standardised_asset_list["solar_reason"] = None
+
+ # Map of variables and fill values for the solar_reason variable
+ solar_reason_map = {
+ "solar_eligible": "Solar Eligible: ",
+ "solar_eligible_needs_heating_upgrade": (
+ "Solar Eligible, Solid Floor, Needs Heating Upgrade: "
+ ),
+ "solar_eligible_solid_wall_uninsulated": "Solar Eligible, Solid Wall Uninsulated, EPC E or Below: ",
+ }
+
+ for variable, reason in solar_reason_map.items():
+ self.standardised_asset_list["solar_reason"] = np.where(
+ self.standardised_asset_list[variable],
+ reason + self.standardised_asset_list["SAP Category"],
+ self.standardised_asset_list["solar_reason"]
+ )
+
+ # Flag anything that has existing outcomes
+ if (self.outcomes is not None) and ("Surveyed" in self.standardised_asset_list.columns):
+
+ if "Installer Refusal" not in self.standardised_asset_list.columns:
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ (self.standardised_asset_list["Surveyed"] > 0)
+ ),
+ None,
+ self.standardised_asset_list["cavity_reason"]
+ )
+ else:
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ (self.standardised_asset_list["Surveyed"] > 0) |
+ (self.standardised_asset_list["Installer Refusal"] > 0)
+ ),
+ None,
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ if self.master_surveyed is not None:
+ self.standardised_asset_list["cavity_reason"] = np.where(
+ (
+ (~pd.isnull(self.standardised_asset_list["submission_date"]))
+ ),
+ None,
+ self.standardised_asset_list["cavity_reason"]
+ )
+
+ blocks_of_flats = self.standardised_asset_list[
+ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] == "block of flats"
+ ]
+
+ non_blocks_of_flats = self.standardised_asset_list[
+ self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE] != "block of flats"
+ ]
+
+ # Produce some aggregate figures
+ self.work_type_figures = {
+ **non_blocks_of_flats["cavity_reason"].value_counts().to_dict(),
+ **{
+ k + " (Block of flats)": v for k, v in
+ blocks_of_flats["solar_reason"].value_counts().to_dict().items()
+ },
+ **self.standardised_asset_list["solar_reason"].value_counts().to_dict()
+ }
+
+ # We prepare outcomes for output
+ if self.outcomes is not None:
+ logger.info("Preparing outcomes for output")
+ identified_work = self.standardised_asset_list[
+ ~pd.isnull(self.standardised_asset_list["cavity_reason"]) |
+ ~pd.isnull(self.standardised_asset_list["solar_reason"])
+ ][self.DOMNA_PROPERTY_ID].values
+
+ if self.DOMNA_PROPERTY_ID in self.outcomes.columns:
+ self.outcomes_for_output = self.outcomes[
+ self.outcomes[self.DOMNA_PROPERTY_ID].isin(identified_work)
+ ]
+
+ def flat_analysis(self):
+     """
+     Summarise EPC scores for flats and store the result on ``self.flat_data``.
+
+     The standardised asset list is grouped by postcode and property type. For every group whose
+     property type column contains "flat", one summary row is produced holding the group size, the
+     number of scores below ``self.FILLED_CAVITY_SAP_THRESHOLD`` and the number of scores below 69.
+
+     :return: None - the summary DataFrame is written to ``self.flat_data``
+     """
+     group_keys = [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE]
+
+     summary_rows = []
+     for _, members in self.standardised_asset_list.groupby(group_keys):
+         # Anything that is not a group of flats is ignored
+         if "flat" not in members[self.STANDARD_PROPERTY_TYPE].values:
+             continue
+
+         flat_count = members[self.STANDARD_PROPERTY_TYPE].shape[0]
+
+         # Scores under the filled-cavity threshold (reported under the "C75" labels)
+         below_threshold = members[
+             self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+         ].lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum()
+
+         # Scores under 69
+         below_69 = members[
+             self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+         ].lt(69).sum()
+
+         summary_rows.append(
+             {
+                 "Postcode": members[self.STANDARD_POSTCODE].iloc[0],
+                 "Property Type": "Flat",
+                 "Number of Flats with EPC": flat_count,
+                 "Number of Flats below C75": below_threshold,
+                 "Proportion of Flat EPCs below C75": round(100 * below_threshold / flat_count),
+                 "Number of Flats Below C69": below_69,
+             }
+         )
+
+     self.flat_data = pd.DataFrame(summary_rows)
+
+ @staticmethod
+ def split_full_name(x):
+     """
+     Split a full name into a title, first name and last name.
+
+     The title is only recognised when it is the whole first word of the name (an optional trailing
+     full stop is ignored), so names that merely start with the same letters as a title, such as
+     "Drew", are left intact. Middle names are discarded.
+
+     :param x: full name string, or a null value
+     :return: tuple ``(title, first_name, last_name)``, each title-cased. ``title`` is None when no
+         title is present; all three are None when ``x`` is null; the names are empty strings when
+         nothing is left after removing the title.
+     """
+     if pd.isnull(x):
+         return None, None, None
+
+     titles = {"mr", "mrs", "ms", "miss", "dr", "prof"}
+     # Splitting on any whitespace also absorbs leading, trailing and repeated spaces
+     words = x.lower().split()
+
+     title = None
+     if words and words[0].rstrip(".") in titles:
+         title = words.pop(0).rstrip(".").title()
+
+     if not words:
+         return title, "", ""
+
+     return title, words[0].title(), words[-1].title()
+
+ def load_contact_details(
+         self,
+         local_filepath,
+         sheet_name,
+         landlord_property_id,
+         phone_number_column=None,
+         email_column=None,
+         fullname_column=None,
+         firstname_column=None,
+         lastname_column=None
+ ):
+     """
+     Load tenant contact details from an Excel sheet and store them on ``self.contact_details``.
+
+     The column names supplied are recorded on ``self.contact_detail_fields``. Rows without a
+     landlord property ID are dropped. When only a full name column is supplied, it is split into
+     "title", "first_name" and "last_name" columns using ``self.split_full_name``.
+
+     :param local_filepath: path of the Excel workbook
+     :param sheet_name: sheet to read
+     :param landlord_property_id: name of the column holding the landlord's property ID
+     :param phone_number_column: name of the phone number column, if present
+     :param email_column: name of the email column, if present
+     :param fullname_column: name of the full name column, if present
+     :param firstname_column: name of the first name column, if present
+     :param lastname_column: name of the last name column, if present
+     :raises NotImplementedError: unless a full name column is supplied without both a first name
+         and a last name column (the other combinations are not handled yet)
+     :return: None
+     """
+     self.contact_detail_fields = {
+         "landlord_property_id": landlord_property_id,
+         "phone_number": phone_number_column,
+         "email": email_column,
+         "fullname": fullname_column,
+         "firstname": firstname_column,
+         "lastname": lastname_column
+     }
+
+     # Only the columns that were actually supplied can be read. Fields that were not supplied
+     # have no column name, so no placeholder column is created for them (the previous "fill"
+     # step only ever produced a single column labelled None).
+     supplied_columns = [
+         column for column in (
+             phone_number_column, email_column, fullname_column, firstname_column, lastname_column
+         ) if column is not None
+     ]
+
+     contact_details = pd.read_excel(
+         local_filepath, sheet_name=sheet_name
+     )[[landlord_property_id] + supplied_columns]
+
+     # Take a copy so the new name columns are added to an independent frame, not a slice
+     contact_details = contact_details[
+         ~pd.isnull(contact_details[landlord_property_id])
+     ].copy()
+
+     if fullname_column and not (firstname_column and lastname_column):
+         # Building the frame from a list keeps this working when the sheet has no usable rows
+         name_parts = pd.DataFrame(
+             [self.split_full_name(name) for name in contact_details[fullname_column]],
+             columns=["title", "first_name", "last_name"],
+             index=contact_details.index,
+         )
+         for column in name_parts.columns:
+             contact_details[column] = name_parts[column]
+     else:
+         raise NotImplementedError("Implement me")
+
+     self.contact_details = contact_details
+
+ def prepare_for_crm(self, company_domain, crm_pipeline_name, first_dealstage, assigned_surveyors):
+     """
+     Prepare the identified work for upload into Hubspot and store it on ``self.hubspot_data``.
+
+     Steps: check every cavity/solar reason against the product lookup table, drop excluded
+     products, merge on contact details, attach the Hubspot product data, add deal and pipeline
+     information, merge on the assigned surveyors and finally reshape everything onto the Hubspot
+     import schema. Schema fields with no data source are added as empty placeholder columns.
+
+     :param company_domain: value written to the "Company Domain Name " column
+     :param crm_pipeline_name: value written to the "Pipeline " column
+     :param first_dealstage: value written to the "Deal Stage " column
+     :param assigned_surveyors: DataFrame keyed on ``self.landlord_property_id``; it must provide the
+         "surveyor_email" column that is mapped to "Deal Owner"
+     :raises ValueError: if a cavity or solar reason is not a key of the product lookup table
+     :return: None
+     """
+     # This is a placeholder for now
+
+     # This maps the opportunities as we reference them, to the product data as stored in Hubspot
+     product_lookup_table = {
+         "Non-Intrusive Data Showed Cavity Extraction": {
+             "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500
+         },
+         "Non-Intrusive Data Showed Empty Cavity": {
+             "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000
+         },
+         "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed": {
+             "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000
+         },
+         "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed": {
+             "name": "Extract & Fill - ECO4", "id": 100307905778, "unit_price": 500
+         },
+         "EPC Data Showed Empty Cavity": {
+             "name": "Empty Cavity & Loft - ECO4", "id": 82733738177, "unit_price": 1000
+         },
+         "Solid Floor, Insulated, No Solar": {
+             "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608
+         },
+         "Solid Floor, Insulated, Needs Loft": {
+             "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608
+         },
+         "Other Floor, Insulated, No Solar": {
+             "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608
+         },
+         "Other Floor, Insulated, Needs Loft": {
+             "name": "Solar PV - ECO4", "id": 82623589564, "unit_price": 1608
+         }
+     }
+
+     # We check that every reason in use is covered by the lookup table. Missing values (None/NaN)
+     # and empty strings mean "no work identified" and are not products.
+     reasons_in_use = (
+         list(self.standardised_asset_list["cavity_reason"].unique()) +
+         list(self.standardised_asset_list["solar_reason"].unique())
+     )
+     if any(
+         pd.notnull(reason) and reason and reason not in product_lookup_table
+         for reason in reasons_in_use
+     ):
+         raise ValueError("We have products not referenced in the lookup table - check this")
+
+     programme_data = self.standardised_asset_list.copy()
+
+     # Exclusions - these are properties we won't treat for the moment
+     product_exclusions = [
+         "Other Floor, Insulated, No Solar",
+         "Other Floor, Insulated, Needs Loft"
+     ]
+     if product_exclusions:
+         logger.warning("Excluding products: %s", product_exclusions)
+
+     programme_data = programme_data[~programme_data["solar_reason"].isin(product_exclusions)]
+
+     # Merge on the contact details
+     # NOTE(review): the contact sheet's ID column is recorded in
+     # self.contact_detail_fields["landlord_property_id"]; this merge assumes
+     # self.landlord_property_id names the same column - confirm
+     programme_data = programme_data.merge(
+         self.contact_details,
+         how="left",
+         left_on=self.STANDARD_LANDLORD_PROPERTY_ID,
+         right_on=self.landlord_property_id,
+     )
+
+     programme_data["Company Domain Name "] = company_domain
+
+     # Append the product data onto the programme data
+     programme_data["cavity_product"] = programme_data["cavity_reason"].map(
+         lambda reason: product_lookup_table.get(reason, {"name": None})["name"]
+     )
+     programme_data["solar_product"] = programme_data["solar_reason"].map(
+         lambda reason: product_lookup_table.get(reason, {"name": None})["name"]
+     )
+
+     # NOTE(review): only the solar reason feeds domna_product, so rows that have just a cavity
+     # reason are dropped below - confirm this is intended
+     programme_data["domna_product"] = programme_data["solar_reason"].copy()
+     programme_data["domna_product"] = np.where(
+         pd.isnull(programme_data["domna_product"]),
+         programme_data["solar_product"],
+         programme_data["domna_product"]
+     )
+     # We filter just on rows where we have a product
+     programme_data = programme_data[
+         ~pd.isnull(programme_data["domna_product"])
+     ]
+     programme_data = programme_data.drop(columns=["solar_product", "cavity_product"])
+
+     product_df = (
+         pd.DataFrame(product_lookup_table).T[["name", "id", "unit_price"]]
+         .reset_index()
+         .rename(
+             columns={
+                 "name": "Name ",
+                 "id": 'Product ID ',
+                 "unit_price": 'Unit price ',
+                 "index": "domna_product"
+             }
+         )
+     )
+
+     product_df['Quantity '] = 1
+
+     # Append on the product data
+     programme_data = programme_data.merge(
+         product_df,
+         how="left",
+         on="domna_product",
+     )
+
+     # Add in deal and pipeline information
+     programme_data["dealname"] = (
+         programme_data[self.STANDARD_FULL_ADDRESS] + " : " + programme_data["domna_product"]
+     )
+     programme_data['Pipeline '] = crm_pipeline_name
+     programme_data['Deal Stage '] = first_dealstage
+     programme_data['Associations: Listing'] = "Property Owner"
+
+     programme_data = programme_data.merge(
+         assigned_surveyors.rename(
+             columns={self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID}
+         ), how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
+     )
+
+     # This maps the hubspot schema (keys) to the columns of programme_data (values). Anything
+     # mapped to None is information we don't have right now and becomes an empty placeholder.
+     schema_mappings = {
+         # The previous literal listed 'Name ' twice (first the Domna property ID, later the
+         # product name). A dict keeps only the last value, so the product name is what has
+         # always been exported here. TODO: confirm which of the two is intended.
+         'Name ': 'Name ',
+         'Company Domain Name ': 'Company Domain Name ',
+         'Email ': self.contact_detail_fields["email"] or None,  # TODO: Review
+         'First Name ': self.contact_detail_fields["firstname"] or None,  # TODO: Review
+         'Last Name ': self.contact_detail_fields["lastname"] or None,  # TODO: Review
+         'Phone ': self.contact_detail_fields["phone_number"] or None,  # TODO: Review
+         'Full Address ': self.STANDARD_FULL_ADDRESS,
+         'Address 1 ': self.STANDARD_ADDRESS_1,
+         'Address 2 ': None,  # TODO: Don't have this for the moment
+         'Postcode ': self.STANDARD_POSTCODE,
+         'Property Type ': self.STANDARD_PROPERTY_TYPE,
+         'Property Sub Type ': None,  # TODO: Don't have this for the moment
+         'Bedroom(s) ': None,  # TODO: Don't have this for the moment
+         'Domna Property ID ': self.DOMNA_PROPERTY_ID,
+         'National UPRN ': (
+             self.STANDARD_UPRN if self.STANDARD_UPRN is not None else self.EPC_API_DATA_NAMES["uprn"]
+         ),
+         'Owner Property ID ': self.STANDARD_LANDLORD_PROPERTY_ID,
+         'Wall Construction ': self.STANDARD_WALL_CONSTRUCTION,
+         'Heating System ': self.STANDARD_HEATING_SYSTEM,
+         'Year Built ': self.STANDARD_YEAR_BUILT,
+         'Boiler Make ': None,  # TODO: Don't have this for the moment
+         'Boiler Model ': None,  # TODO: Don't have this for the moment
+         'Non-Intrusives: Date Checked ': None,  # TODO: Don't have this for the moment
+         'Non-Intrusives: Wall Type ': (
+             "non-intrusives: Construction" if self.non_intrusives_present else None
+         ),
+         'Non-intrusives: Insulation ': (
+             "non-intrusives: Insulated" if self.non_intrusives_present else None
+         ),
+         'Non-intrusives: Insulation Material ': (
+             "non-intrusives: Material" if self.non_intrusives_present else None
+         ),
+         'Non-Intrusives: CIGA Check Required ': (
+             'non-intrusives: CIGA Check Required' if self.non_intrusives_present else None
+         ),
+         'Non-Intrusives: PV Access Issues ': (
+             'non-intrusives: PV, ACCESS ISSUE, SEE NOTES' if self.non_intrusives_present else None
+         ),
+         'Non-Intrusives: Roof Orientation ': (
+             'non-intrusives: OFF GAS - ROOF ORIENTATION' if self.non_intrusives_present else None
+         ),
+         'Non-Intrusives: Surveyor Notes ': (
+             'non-intrusives: Any further surveyor notes' if self.non_intrusives_present else None
+         ),
+         'Non-Intrusives: Surveyor Name ': (
+             'non-intrusives: Surveyors Name' if self.non_intrusives_present else None
+         ),
+         'CIGA: Date Requested ': None,  # TODO: Don't have this for the moment
+         'CIGA: Cavity Guarantee Found ': None,
+         'Last EPC: Is Estimated ': self.EPC_API_DATA_NAMES["estimated"],
+         'Last EPC: EPC Rating ': self.EPC_API_DATA_NAMES["current-energy-rating"],
+         'Last EPC: SAP Rating ': self.EPC_API_DATA_NAMES["current-energy-efficiency"],
+         'Last EPC: Main Heating Description ': self.EPC_API_DATA_NAMES["mainheat-description"],
+         'Last EPC: Heating Controls ': self.EPC_API_DATA_NAMES["mainheatcont-description"],
+         'Last EPC: Lodgement Date ': self.EPC_API_DATA_NAMES["inspection-date"],
+         'Last EPC: Floor Area ': self.EPC_API_DATA_NAMES["total-floor-area"],
+         'Last EPC: Wall ': self.EPC_API_DATA_NAMES["walls-description"],
+         'Last EPC: Roof ': self.EPC_API_DATA_NAMES["roof-description"],
+         'Last EPC: Floor ': self.EPC_API_DATA_NAMES["floor-description"],
+         'Last EPC: Room Height ': self.EPC_API_DATA_NAMES["floor-height"],
+         'Last EPC: Age Band ': self.EPC_API_DATA_NAMES["construction-age-band"],
+         'Deal Stage ': 'Deal Stage ',
+         'Pipeline ': 'Pipeline ',
+         'Expected Commencement Date ': None,  # TODO: Need to set this,
+         'Deal Name ': "dealname",
+         'Product ID ': 'Product ID ',
+         'Unit price ': 'Unit price ',
+         'Quantity ': 'Quantity ',
+         'Deal Owner': 'surveyor_email',
+         'Amount ': 'Unit price ',
+     }
+
+     # We now create the finalised dataset to be uploaded into Hubspot. Each schema column is
+     # built from its own source column: selecting the sources and renaming them afterwards breaks
+     # when one source feeds several schema columns ('Unit price ' also feeds 'Amount '), because
+     # a rename can only give each source a single new name.
+     hubspot_data = pd.DataFrame(
+         {
+             schema_column: programme_data[source_column]
+             for schema_column, source_column in schema_mappings.items()
+             if source_column is not None
+         },
+         index=programme_data.index
+     )
+
+     # We add placeholder columns for the information we haven't got right now
+     for schema_column, source_column in schema_mappings.items():
+         if source_column is None:
+             hubspot_data[schema_column] = None
+
+     self.hubspot_data = hubspot_data
+
+ def flag_outcomes(
+ self,
+ outcomes_filepath,
+ outcomes_sheetname,
+ outcomes_address,
+ outcomes_postcode,
+ outcomes_houseno,
+ outcomes_id
+ ):
+ """
+ Load a sheet of survey outcomes and match each outcome row to a property in the asset list.
+
+ Does nothing when outcomes_filepath is None. Each outcome row is tried against the asset list
+ using, in turn: the landlord property ID, the cleaned full address, the postcode plus house
+ number, and a fuzzy match on the address. Rows that cannot be matched are kept on
+ self.outcomes_no_match. Matched rows are counted per property and per outcome, and those
+ counts (plus a visit_count) are merged onto self.standardised_asset_list.
+
+ :param outcomes_filepath: path of the Excel workbook holding the outcomes, or None
+ :param outcomes_sheetname: sheet to read
+ :param outcomes_address: name of the address column in the outcomes sheet
+ :param outcomes_postcode: name of the postcode column in the outcomes sheet
+ :param outcomes_houseno: name of the house number column, or None to derive one from the address
+ :param outcomes_id: name of the column holding the landlord property ID, or None
+ :raises Exception: if a property ID appears more than once in the pivoted outcomes
+ :return: None
+ """
+ if outcomes_filepath is None:
+ return
+
+ self.outcomes = pd.read_excel(outcomes_filepath, sheet_name=outcomes_sheetname)
+ # row_id lets us tie the lookup results back to the original outcome rows
+ self.outcomes["row_id"] = self.outcomes.index
+
+ if outcomes_houseno is None:
+ outcomes_houseno = "houseno"
+ # NOTE(review): the lambda receives one address at a time but is given the whole postcode
+ # column; other call sites pass a single postcode string - confirm
+ self.outcomes["houseno"] = self.outcomes[outcomes_address].apply(
+ lambda x: SearchEpc.get_house_number(x, self.outcomes[outcomes_postcode])
+ )
+
+ logger.info("Matching outcomes to asset list")
+ # Merge the outcomes onto the asset list - we check we're able to match sufficiently well
+ lookup = []
+ nomatch = []
+ for _, x in tqdm(self.outcomes.iterrows(), total=len(self.outcomes)):
+
+ # Rows without an address are skipped - they are recorded neither as matched nor unmatched
+ if pd.isnull(x[outcomes_address]):
+ continue
+
+ # Check if we have an id
+ oid = x[outcomes_id] if outcomes_id is not None else None
+
+ # Strategy 1: match on the landlord property ID
+ if oid is not None:
+ matched = self.standardised_asset_list[
+ (self.standardised_asset_list[
+ self.STANDARD_LANDLORD_PROPERTY_ID
+ ].str.strip() == oid)
+ ]
+
+ if matched.shape[0] == 1:
+ lookup.append(
+ {
+ "row_id": x["row_id"],
+ self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0]
+ }
+ )
+ continue
+
+ # Strategy 2: match on the full address, lower-cased and with commas removed
+ address_clean = x[outcomes_address].lower().replace(",", "").replace(" ", " ")
+
+ # NOTE(review): this lower-cases the whole Outcome column and does not depend on the current
+ # row - it looks like it could run once before the loop; confirm
+ self.outcomes["Outcome"] = self.outcomes["Outcome"].str.lower()
+
+ matched = self.standardised_asset_list[
+ (self.standardised_asset_list[
+ self.STANDARD_FULL_ADDRESS
+ ].str.lower().str.replace(",", "").str.replace(" ", " ") == address_clean)
+ ]
+
+ if matched.shape[0] == 1:
+ lookup.append(
+ {
+ "row_id": x["row_id"],
+ self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0]
+ }
+ )
+ continue
+
+ # Strategy 3: narrow to the same postcode, then compare house numbers
+ matched = self.standardised_asset_list[
+ (self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip() == x[outcomes_postcode])
+ ].copy()
+ if not matched.empty:
+ matched["houseno"] = matched.apply(
+ lambda x: SearchEpc.get_house_number(
+ str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE])
+ ),
+ axis=1
+ )
+
+ matched = matched[
+ matched["houseno"].astype(str) == str(x[outcomes_houseno])
+ ]
+ if matched.shape[0] == 1:
+ lookup.append(
+ {
+ "row_id": x["row_id"],
+ self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0]
+ }
+ )
+ continue
+ elif not matched.empty:
+ # Strategy 4: several candidates remain, so take the closest address by fuzzy match
+ # NOTE(review): the "address" column built here does not appear to be read afterwards
+ matched["address"] = matched[self.STANDARD_ADDRESS_1] + " " + matched[self.STANDARD_POSTCODE]
+
+ # NOTE(review): reads the literal "Address" column rather than outcomes_address - confirm
+ # the outcomes sheet always names its address column "Address"
+ best_match = process.extractOne(x["Address"], matched[self.STANDARD_FULL_ADDRESS].values)[0]
+ matched = matched[matched[self.STANDARD_FULL_ADDRESS] == best_match]
+ lookup.append(
+ {
+ "row_id": x["row_id"],
+ self.DOMNA_PROPERTY_ID: matched[self.DOMNA_PROPERTY_ID].values[0]
+ }
+ )
+ continue
+
+ # No strategy produced a match for this row
+ nomatch.append(x["row_id"])
+
+ self.outcomes_no_match = self.outcomes[self.outcomes["row_id"].isin(nomatch)]
+ lookup = pd.DataFrame(lookup)
+
+ # Nothing matched, so there is nothing to merge onto the asset list
+ if lookup.empty:
+ return
+
+ # We will have duplicated domna property IDs, where a surveyor has been to a property multiple times
+ # Where we have multiple rows, we want to make a call on what the action should be. For example,
+ # there may be properties that have been visited multiple times where the outcome was "See notes" implying
+ # that the surveyor had a detailed explanation as to why they couldn't gain access so if this has
+ # happened multiple times, in this case we judge that the work may not be viable
+
+ date_col = "Week Commencing" if "Week Commencing" in self.outcomes else "Survey Date"
+
+ lookup = lookup.merge(
+ self.outcomes[["row_id", "Outcome", "Notes", date_col]], how="left", on="row_id"
+ )
+
+ # Number of matched outcome rows per property
+ visit_counts = (
+ lookup.groupby(self.DOMNA_PROPERTY_ID)["row_id"]
+ .count()
+ .reset_index()
+ .rename(columns={"row_id": "visit_count"})
+ .sort_values("visit_count", ascending=False)
+ )
+
+ # One row per property with one count column per distinct outcome
+ # NOTE(review): from here on the literal "domna_property_id" is used where the lookup was built
+ # with self.DOMNA_PROPERTY_ID - presumably the two are the same string; confirm
+ pivot_df = lookup.groupby(["domna_property_id", "Outcome"]).size().unstack(fill_value=0).reset_index()
+ pivot_df = pivot_df.merge(
+ visit_counts, how="left", on="domna_property_id"
+ )
+
+ if pivot_df[self.DOMNA_PROPERTY_ID].duplicated().sum():
+ raise Exception("We have duplicated property IDs in the outcomes data")
+
+ # We merge this data onto outcomes
+ self.outcomes["matched_to_asset_list"] = self.outcomes["row_id"].isin(lookup["row_id"].values)
+ self.outcomes = self.outcomes.merge(lookup[["row_id", "domna_property_id"]], how="left", on="row_id")
+
+ # We merge our pivoted outcomes onto the asset list
+ self.standardised_asset_list = self.standardised_asset_list.merge(
+ pivot_df, how="left", left_on=self.DOMNA_PROPERTY_ID, right_on="domna_property_id"
+ )
+
+ self.outcomes = self.outcomes.sort_values("domna_property_id", ascending=False)
+
+ def flag_survey_master(
+ self,
+ master_filepaths,
+ master_to_asset_list_filepath=None
+ ):
+ """
+ Read the survey master sheets and merge their survey status onto the asset list.
+
+ Does nothing when master_filepaths is empty. Each master CSV is reduced to a landlord property
+ ID, a survey status, a submission date and a "cancelled" flag. When the master has a "UPRN"
+ column it is used directly as the landlord property ID; otherwise each row is matched to the
+ asset list on postcode, house number, street name and property type. The combined result is
+ de-duplicated on the landlord property ID, stored on self.master_surveyed and merged onto
+ self.standardised_asset_list. Rows that could not be matched are kept on
+ self.unmatched_submissions.
+
+ :param master_filepaths: list of CSV file paths, one per master sheet
+ :param master_to_asset_list_filepath: optional CSV merged onto every master on
+ ['NO.', 'Street / Block Name', 'Post Code'] - presumably it supplies the "UPRN" column; confirm
+ :raises NotImplementedError: if a master row does not resolve to exactly one asset list row
+ :return: None
+ """
+ # TODO: This probably needs further expansion
+
+ if not master_filepaths:
+ return
+
+ if master_to_asset_list_filepath is not None:
+ id_map = pd.read_csv(master_to_asset_list_filepath)
+ else:
+ id_map = pd.DataFrame()
+
+ logger.info("Getting masters and merging onto asset list")
+ master_surveyed = []
+ unmatched_submissions = []
+ for filepath in master_filepaths:
+ master_data = pd.read_csv(filepath)
+ # Strip columns
+ master_data.columns = [c.strip() for c in master_data.columns]
+
+ if not id_map.empty:
+ master_data = master_data.merge(
+ id_map, how="left", on=['NO.', 'Street / Block Name', 'Post Code']
+ )
+
+ # The masters do not share a schema, so pick whichever spelling of each column is present
+ install_col = (
+ "INSTALLED OR CANCELLED" if "INSTALLED OR CANCELLED" in master_data.columns
+ else "INSTALL / CANCELLATION DATE"
+ )
+
+ submission_col = (
+ "SUBMISSION DATE" if "SUBMISSION DATE" in master_data.columns else "SUBMISSION DATE TO INSTALLERS"
+ )
+
+ # Easy case: the master's "UPRN" column is used as the landlord property ID, so no matching is needed
+ if "UPRN" in master_data.columns:
+ # We just need to check if any were cancelled
+ master_to_append = master_data[
+ ["UPRN", install_col, submission_col]
+ ].rename(
+ columns={
+ "UPRN": self.STANDARD_LANDLORD_PROPERTY_ID,
+ install_col: "survey_status",
+ submission_col: "submission_date"
+ }
+ )
+ master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
+
+ master_surveyed.append(master_to_append)
+ continue
+
+ master_data["row_id"] = master_data.index
+
+ # Temporary house number column on the asset list, dropped again once this master is matched
+ self.standardised_asset_list["house_no"] = self.standardised_asset_list.apply(
+ lambda x: SearchEpc.get_house_number(
+ str(x[self.STANDARD_ADDRESS_1]), str(x[self.STANDARD_POSTCODE])
+ ),
+ axis=1
+ )
+
+ postcode_col = "POSTCODE" if "POSTCODE" in master_data.columns else "Post Code"
+ house_no_col = 'NO.' if 'NO.' in master_data.columns else "NO"
+
+ # Otherwise, we need to match algorithmically
+ logger.info("Matching master data to asset list")
+ matched = []
+ unmatched = []
+ for _, row in tqdm(master_data.iterrows(), total=len(master_data)):
+ # Rows without a postcode are skipped and are not recorded as unmatched
+ if pd.isnull(row[postcode_col]):
+ continue
+ postcode_no_space = row[postcode_col].strip().replace(" ", "").lower()
+
+ # Candidate properties: same postcode, ignoring case and spaces
+ df = self.standardised_asset_list[
+ (
+ self.standardised_asset_list[self.STANDARD_POSTCODE].str.strip().str.lower().str.replace(" ",
+ "")
+ == postcode_no_space
+ )
+ ]
+
+ house_no = row[house_no_col]
+
+ # NOTE(review): compares the raw master value with the derived house_no values - the two may
+ # not share a type (e.g. int vs str); confirm
+ if house_no in df["house_no"].values:
+ df = df[df["house_no"] == house_no]
+ if df.shape[0] != 1:
+ # Narrow down by street name
+
+ # NOTE(review): str.contains treats the street name as a regular expression by default and a
+ # missing street name would not be a valid pattern - confirm the column is always populated
+ if any(df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])):
+ df = df[
+ df[self.STANDARD_FULL_ADDRESS].str.contains(row["Street / Block Name"])
+ ]
+ else:
+ # Levenshtein distance
+ # NOTE(review): extractOne is given a single address string as its list of choices, and the
+ # join requires the house number to already be a string - confirm both behave as intended
+ df = df[
+ df[self.STANDARD_FULL_ADDRESS].str.lower().apply(
+ lambda x: process.extractOne(
+ " ".join([row[house_no_col], row["Street / Block Name"], row["TOWN"]]).lower(),
+ x
+ )[1]
+ ) > 90
+ ]
+
+ if df.shape[0] == 0:
+ unmatched.append(row["row_id"])
+ continue
+
+ # Prefer candidates whose address contains "<house number> <street name>"
+ if any(df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
+ " ".join([row[house_no_col], row["Street / Block Name"]]).lower()
+ )):
+ df = df[
+ df[self.STANDARD_FULL_ADDRESS].str.lower().str.contains(
+ " ".join([row[house_no_col], row["Street / Block Name"]]).lower()
+ )
+ ]
+
+ # Prefer candidates whose property type contains the last word of the master's property type
+ if any(
+ df[self.STANDARD_PROPERTY_TYPE].str.contains(
+ row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower()
+ )
+ ):
+ # We ignore "block of flats" entries
+ df = df[
+ df[self.STANDARD_PROPERTY_TYPE].str.contains(
+ row["PROPERTY TYPE As per table emailed"].split(" ")[-1].lower()
+ ) & (df[self.STANDARD_PROPERTY_TYPE] != "block of flats")
+ ]
+
+ if df.shape[0] != 1:
+ # We have multiple matches
+ raise NotImplementedError("FIX ME")
+ matched.append(
+ {
+ "row_id": row["row_id"],
+ self.STANDARD_LANDLORD_PROPERTY_ID: df[self.STANDARD_LANDLORD_PROPERTY_ID].values[0],
+ }
+ )
+
+ self.standardised_asset_list = self.standardised_asset_list.drop(columns="house_no")
+
+ # We match the "UPRN" which is the landlords ID, onto the master sheet
+ matched = pd.DataFrame(matched)
+ master_to_append = master_data[["row_id", install_col, submission_col]].merge(
+ matched, how="left", on="row_id"
+ ).rename(
+ columns={
+ install_col: "survey_status",
+ submission_col: "submission_date"
+ }
+ )
+ master_to_append["cancelled"] = master_to_append["survey_status"].str.lower().str.contains("cancel")
+ master_surveyed.append(master_to_append)
+ unmatched_df = master_data[
+ master_data["row_id"].isin(unmatched)
+ ]
+
+ scheme_col = (
+ "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" if
+ "AFFORDABLE WARMTH OR EPC FOR HOUSING ASSOCIATION" in master_data.columns else "AFFORDABLE WARMTH"
+ )
+ # The columns are massively different - we take just a few
+ unmatched_df = unmatched_df[
+ [
+ scheme_col, house_no_col, "Street / Block Name", postcode_col, install_col, submission_col
+ ]
+ ].rename(
+ columns={
+ scheme_col: "Funding Scheme",
+ house_no_col: "House Number",
+ postcode_col: "Postcode",
+ install_col: "survey_status",
+ submission_col: "submission_date"
+ }
+ )
+
+ unmatched_submissions.append(unmatched_df)
+
+ # Combine every master, then drop rows that have no usable landlord property ID
+ master_surveyed = pd.concat(master_surveyed)
+ master_surveyed = master_surveyed[~pd.isnull(master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID])]
+ master_surveyed = master_surveyed[
+ ~master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID].isin(
+ ["NOT ON ASSET LIST", "Missing From Asset List"]
+ )
+ ]
+
+ # Cast the ID to a string before it is used as the merge key
+ master_surveyed[self.STANDARD_LANDLORD_PROPERTY_ID] = master_surveyed[
+ self.STANDARD_LANDLORD_PROPERTY_ID
+ ].astype(str)
+
+ # We de-dupe crudely on landlord property id
+ self.master_surveyed = master_surveyed.drop_duplicates(subset=[self.STANDARD_LANDLORD_PROPERTY_ID])
+
+ self.standardised_asset_list = self.standardised_asset_list.merge(
+ self.master_surveyed, how="left", on=self.STANDARD_LANDLORD_PROPERTY_ID
+ )
+
+ # Finally, we keep a record of the unmatched
+ if unmatched_submissions:
+ self.unmatched_submissions = pd.concat(
+ unmatched_submissions
+ )
diff --git a/asset_list/DataMapper.py b/asset_list/DataMapper.py
new file mode 100644
index 00000000..ac1b8db3
--- /dev/null
+++ b/asset_list/DataMapper.py
@@ -0,0 +1,178 @@
+ import os
+ import re
+
+ import pandas as pd
+ import tiktoken
+ from fuzzywuzzy import process
+ from openai import OpenAI
+
+ from utils.logger import setup_logger
+
+ # This module starts without any imports, yet the code below relies on os, re, pd, tiktoken,
+ # process, OpenAI and logger - they are brought into scope here so the module can be imported.
+ logger = setup_logger()
+
+ # OpenAI API Key (set this in your environment variables for security)
+ OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+
+ class DataRemapper:
+     """
+     Standardise raw category values onto a fixed set of standard values.
+
+     Values are resolved cheapest-first: the predefined map, an exact match on the cleaned value,
+     a fuzzy match, and finally one batched OpenAI request for whatever is still unmapped.
+     """
+
+     def __init__(self, standard_values, standard_map=None, max_tokens=1000):
+         """
+         Initialize the remapper with standard values and a predefined mapping.
+
+         :param standard_values: Set of allowed standardized values.
+         :param standard_map: Dictionary of common remappings {raw_value: standard_value}.
+         :param max_tokens: Limit applied to both the prompt size and the response size.
+         """
+         self.standard_values = standard_values
+         # Default to an empty map so the rule-based lookups still work when no map is supplied
+         self.standard_map = standard_map if standard_map is not None else {}
+         self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
+         self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing
+
+         # Tokenizer for counting tokens
+         self.tokenizer = tiktoken.encoding_for_model(self.ai_model)
+
+         # Track token usage and remap dictionary
+         self.total_tokens_used = 0
+         self.total_cost = 0
+         self.remap_dict = {}  # {original_value: standardized_value}
+         self.max_tokens = max_tokens  # Limit for OpenAI API
+
+         # Memoization for AI calls
+         self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
+         # Capture the response for debugging
+         self.ai_response = None
+
+         # OpenAI pricing (as of Feb 2024)
+         self.pricing = {
+             "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
+             "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
+         }
+
+         self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+     @staticmethod
+     def clean_string(text):
+         """Basic text cleaning: remove extra spaces, punctuation, and normalize case."""
+         if not isinstance(text, str):
+             return None
+         text = text.strip().lower()
+         text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+         # Collapse runs of whitespace into a single space
+         text = re.sub(r'\s+', ' ', text)
+         return text
+
+     def fuzzy_match(self, text):
+         """
+         Use fuzzy matching to find the closest standard value.
+
+         :param text: cleaned value to match; falsy values are never matched.
+         :return: the best standard value when its score reaches ``self.fuzzy_threshold``, else None.
+         """
+         if not text or not self.standard_values:
+             return None
+         best = process.extractOne(text, self.standard_values)
+         # extractOne returns None when it has nothing to compare against
+         if best is None:
+             return None
+         match, score = best[0], best[1]
+         return match if score >= self.fuzzy_threshold else None
+
+     def count_tokens(self, text):
+         """Estimate the number of tokens in a given text."""
+         return len(self.tokenizer.encode(text)) if text else 0
+
+     @staticmethod
+     def _parse_mapping(output_text):
+         """Parse the model output into a dictionary without executing it; None if that fails."""
+         # Local imports: the module's import block is outside this class
+         import ast
+         import json
+
+         try:
+             parsed = json.loads(output_text)
+         except ValueError:
+             # Not valid JSON - accept a plain Python dict literal too, but never evaluate code
+             try:
+                 parsed = ast.literal_eval(output_text)
+             except (ValueError, SyntaxError, TypeError):
+                 return None
+         return parsed if isinstance(parsed, dict) else None
+
+     def ai_standardize(self, unmapped_values):
+         """
+         Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.
+
+         :param unmapped_values: list of raw values that no other strategy could map.
+         :return: Dictionary {original_value: standardized_value}. Every value maps to "unknown" when
+             the response cannot be parsed as a dictionary.
+         :raises ValueError: if the prompt is longer than ``self.max_tokens`` tokens.
+         """
+         if not unmapped_values:
+             return {}
+
+         # Sort on the string form so that mixed-type values cannot break the ordering
+         unmapped_tuple = tuple(sorted(unmapped_values, key=str))  # Ensure consistency for memoization
+         if unmapped_tuple in self.ai_cache:
+             return self.ai_cache[unmapped_tuple]  # Return memoized result
+
+         prompt = f"""
+         You are an expert in data classification. Standardize each of these values into one of the categories:
+         {list(self.standard_values)}.
+
+         Return only a JSON dictionary where:
+         - The keys are the original values.
+         - The values are the standardized ones.
+
+         Strictly return JSON **without markdown formatting** or extra text.
+
+         Example Output:
+         {{
+         "BLKHOUS": "block house",
+         "BEDSIT": "bedsit"
+         }}
+
+         Values to standardize:
+         {unmapped_values}
+         """
+
+         # Count input tokens
+         input_tokens = self.count_tokens(prompt)
+         if input_tokens > self.max_tokens:
+             raise ValueError("Input tokens exceed the maximum limit.")
+
+         logger.info("Calling OpenAI API for standardization...")
+         response = self.openai_client.chat.completions.create(
+             model=self.ai_model,
+             messages=[{"role": "user", "content": prompt}],
+             max_tokens=self.max_tokens,
+             temperature=0.1,
+         )
+
+         # The message content can be missing, in which case we treat it as an empty response
+         output_text = (response.choices[0].message.content or "").strip()
+         output_tokens = self.count_tokens(output_text)  # Count output tokens
+
+         # Track total token usage
+         self.total_tokens_used += input_tokens + output_tokens
+
+         # Estimate cost
+         input_cost = input_tokens * self.pricing[self.ai_model]["input"]
+         output_cost = output_tokens * self.pricing[self.ai_model]["output"]
+         self.total_cost += input_cost + output_cost
+
+         # The response is untrusted text, so it is parsed as data rather than passed to eval()
+         mapping = self._parse_mapping(output_text)
+         if mapping is None:
+             logger.warning("Could not parse the AI response as a dictionary - falling back to unknown")
+             mapping = {val: "unknown" for val in unmapped_values}  # Fallback
+
+         # Memoize the AI response
+         self.ai_cache[unmapped_tuple] = mapping
+         # We store the raw AI response for debugging
+         logger.debug("AI Response: %s", mapping)
+         self.ai_response = output_text
+
+         return mapping
+
+     def standardize_list(self, values_to_remap):
+         """
+         Standardizes a list of values and returns a dictionary {original_value: standardized_value}.
+
+         :param values_to_remap: List of raw values to standardize.
+         :return: Dictionary {original_value: standardized_value}.
+         """
+         unique_values = set(values_to_remap)  # Process only unique values
+         # Tolerate a missing map, e.g. when the attribute has been reset to None
+         standard_map = self.standard_map or {}
+
+         unmapped_values = []
+         for value in unique_values:
+             if pd.isna(value):  # Handle NaN values
+                 self.remap_dict[value] = "unknown"
+                 continue
+
+             cleaned_value = self.clean_string(value)
+
+             # Rule-Based Check (Predefined Mapping) - the cleaned form takes priority over the raw one
+             if cleaned_value in standard_map:
+                 self.remap_dict[value] = standard_map[cleaned_value]
+                 continue
+
+             if value in standard_map:
+                 self.remap_dict[value] = standard_map[value]
+                 continue
+
+             # Only strings have a lower-cased form to look up
+             if isinstance(value, str) and value.lower() in standard_map:
+                 self.remap_dict[value] = standard_map[value.lower()]
+                 continue
+
+             # Exact Match in Standard Values
+             if cleaned_value in self.standard_values:
+                 self.remap_dict[value] = cleaned_value
+                 continue
+
+             # Fuzzy Matching
+             fuzzy_match = self.fuzzy_match(cleaned_value)
+             if fuzzy_match:
+                 self.remap_dict[value] = fuzzy_match
+                 continue
+
+             # Capture anything that wasn't mapped
+             unmapped_values.append(value)
+
+         # AI Model - remap anything unmapped (batch request)
+         ai_mapping = self.ai_standardize(unmapped_values)
+         self.remap_dict.update(ai_mapping)
+
+         return self.remap_dict
+
+     def report_usage(self):
+         """Prints a summary of token usage and cost."""
+         print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
+         print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
diff --git a/asset_list/app.py b/asset_list/app.py
new file mode 100644
index 00000000..a284371e
--- /dev/null
+++ b/asset_list/app.py
@@ -0,0 +1,953 @@
+import os
+import json
+import pandas as pd
+from pprint import pprint
+import msgpack
+from utils.s3 import read_from_s3
+from asset_list.AssetList import AssetList
+from asset_list.mappings.property_type import PROPERTY_MAPPING
+from asset_list.mappings.built_form import BUILT_FORM_MAPPINGS
+from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
+from asset_list.mappings.heating_systems import HEATING_MAPPINGS
+from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
+from asset_list.mappings.roof import ROOF_CONSTRUCTION_MAPPINGS
+from asset_list.utils import get_data
+
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+
+# Load variables from backend/.env into the process environment before reading the token
+# (the relative path is presumably resolved against the working directory — confirm).
+load_dotenv(dotenv_path="backend/.env")
+# Token passed to get_data() as epc_auth_token; None when the variable is not set.
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
+    """
+    Add an ``address1_extracted`` column derived from the full-address column.
+
+    :param asset_list: DataFrame to annotate; it is modified in place and returned.
+    :param full_address_col: Name of the column holding the full address string.
+    :param postcode_col: Name of the postcode column (only read by "house_number_extraction").
+    :param method: One of "first_two_words", "first_word" or "house_number_extraction".
+    :return: The same DataFrame with ``address1_extracted`` set.
+    :raises ValueError: If ``method`` is not one of the supported names.
+    """
+    if method not in ("first_two_words", "first_word", "house_number_extraction"):
+        raise ValueError(f"Method {method} not recognized")
+
+    if method == "house_number_extraction":
+        # Row-wise, because the house-number lookup takes both the address and the postcode
+        extracted = asset_list.apply(
+            lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
+            axis=1
+        )
+    elif method == "first_two_words":
+        extracted = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
+    else:  # "first_word"
+        extracted = asset_list[full_address_col].str.split(" ").str[0]
+
+    asset_list["address1_extracted"] = extracted
+    return asset_list
+
+
+def app():
+    """
+    Run the asset-list pipeline end to end for one landlord.
+
+    Steps, in order: build and standardise an AssetList from the configured spreadsheet,
+    flag outcomes and survey-master submissions, pull EPC data in chunks (cached as CSVs in
+    a "Chunks" folder), merge the EPC data back in, identify work types, run the Westward
+    comparison checks, prepare a CRM export, and write an Excel workbook plus a HubSpot CSV.
+
+    Originally written to pull EPC data for some properties owned by Livewest.
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+    """
+
+    # TODO:
+    # For cavity work:
+    # - Flag any entries that have a different wall type between non-intrusive data against EPC
+    # - Worth double checking entries that have a difference in wall construction
+    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
+    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
+    # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
+    # are less than C75
+    # - Flag anything pre SAP2012
+    # - Flag anything over 5 years old
+    # - Look at year built vs age band
+    #
+    # For Solar:
+    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
+    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
+    # electric room heaters but it might need to be an EPC E
+    # - Fabric - check the floor, wall and roof:
+    # - Filled or empty cavity is good
+    # - Insulated solid/timber/system built is good
+    # - SCIS/CEG needs solid floors
+    # - JJC don’t care
+    # - Anything with a loft 200 or below
+    # - Anything C75 and above won’t qualify
+    # - Insulated loft = 200mm
+    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
+    # - Or the insulation required is loft/cavity (floors should be solid)
+
+    # NOTE(review): every customer block below reassigns the same local names, so only the
+    # LAST assignment of each name is used when AssetList is built. Names that a later block
+    # does not set keep the value from an earlier block: as written, landlord_roof_construction,
+    # landlord_sap and phase still hold the Torus values, and landlord_built_form and
+    # outcomes_id the Westward values, when the final (Plus Dane) configuration runs.
+    # All paths are absolute paths on one developer's machine.
+
+    # Bromford
+    data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
+                   "Rebuild/Prepared data/")
+    data_filename = "asset_list.xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'PostCode'
+    fulladdress_column = "FullAddress"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "ConYear"
+    landlord_os_uprn = None
+    landlord_property_type = "AssetTypeDesc"
+    landlord_built_form = "PropTypeDesc"
+    landlord_wall_construction = "Construction type"
+    landlord_roof_construction = None
+    landlord_heating_system = "Heating Type"
+    landlord_existing_pv = None
+    landlord_property_id = "Asset"
+    landlord_sap = None
+    outcomes_filename = "outcomes.xlsx"
+    outcomes_sheetname = "Sheet1"
+    outcomes_postcode = "Postcode"
+    outcomes_houseno = "No"
+    outcomes_id = None
+    outcomes_address = "Address"
+    master_filepaths = [
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO "
+        "3 submissions.csv",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared data/ECO "
+        "4 submissions.csv",
+    ]
+    master_to_asset_list_filepath = None
+    phase = False
+
+    # Torus
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Torus/Phase 1"
+    data_filename = "Torus Property Asset List - Phase 1.xlsx"
+    sheet_name = "TORUS"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "AddressLine1"
+    address1_method = None
+    address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
+    missing_postcodes_method = None
+    landlord_year_built = "Property Age"
+    landlord_os_uprn = "NatUPRN"
+    landlord_property_type = "Property Type"
+    landlord_built_form = "Built Form"
+    landlord_wall_construction = "Wall Construction"
+    landlord_roof_construction = "Roof Construction"
+    landlord_heating_system = "Space Heating Source"
+    landlord_existing_pv = "Low Carbon Technology (Solar PV)"
+    landlord_property_id = "UPRN"
+    landlord_sap = "SAP Score"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+    phase = True
+
+    # Ealing - houses
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing"
+    data_filename = "Ealing_rechecked_cleaned_05042025.csv"
+    sheet_name = None
+    postcode_column = 'Postcode'
+    fulladdress_column = "Address"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Year Built"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Type Code"
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "Property ref"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # Southern Midlands
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern/Midlands Properties - Apr 2025"
+    data_filename = "Southern Housing Midlands Property List - combined.xlsx"
+    sheet_name = "Sheet 1"
+    postcode_column = 'Post Code'
+    fulladdress_column = "Address"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Age_1"
+    landlord_os_uprn = None
+    landlord_property_type = "Prop_Type"
+    landlord_built_form = "Prop_Type"
+    landlord_wall_construction = "Walls_P"
+    landlord_heating_system = "Heating System"
+    landlord_existing_pv = None
+    landlord_property_id = "AssetID"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    outcomes_address = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # Live West (2018 Asset list)
+    data_folder = (
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/2018 Asset List"
+    )
+    data_filename = "LIVEWEST STOCK - 23rd October 2018.xlsx"
+    sheet_name = "Assets"
+    postcode_column = 'Postcode'
+    fulladdress_column = "Address"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Build Year"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Archetype"
+    landlord_built_form = None
+    landlord_wall_construction = None
+    landlord_heating_system = "Heating Fuel Type"
+    landlord_existing_pv = None
+    landlord_property_id = "Uprn - DO NOT DELETE"
+    outcomes_filename = "RT - LiveWest.xlsx"
+    outcomes_sheetname = "Feedback"
+    outcomes_postcode = "Poscode"
+    outcomes_houseno = "No."
+    outcomes_id = "UPRN"
+    master_filepaths = [
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master "
+        "- redacted for analysis/CAVITY-Table 1.csv"
+    ]
+    master_to_asset_list_filepath = None
+
+    # Live West (South West asset list)
+    data_folder = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March "
+                   "2025/Livewest Asset List (Original) - csv")
+    data_filename = "Report-Table 1.csv"
+    sheet_name = None
+    postcode_column = 'Postcode'
+    fulladdress_column = "T1_Address"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Build Yr"
+    landlord_os_uprn = None
+    landlord_property_type = "T1_AssetType"
+    landlord_built_form = "T1_AssetType"
+    landlord_wall_construction = "Wall Type Cavity"
+    landlord_heating_system = "Heating Fuel"
+    landlord_existing_pv = None
+    landlord_property_id = "T1_UPRN"
+    outcomes_filename = "RT - LiveWest.xlsx"
+    outcomes_sheetname = "Feedback"
+    outcomes_postcode = "Poscode"
+    outcomes_houseno = "No."
+    outcomes_id = "UPRN"
+    master_filepaths = [
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Livewest/Programme Update - March 2025/Rolling Master "
+        "- redacted for analysis/CAVITY-Table 1.csv"
+    ]
+    master_to_asset_list_filepath = None
+
+    # PFP London
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/London"
+    data_filename = "PFP AREAS SURROUNDING LONDON - JAY, RUTH & LANE.xlsx"
+    sheet_name = "PFP SURROUNDING LONDON"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "AddressLine1"
+    address1_method = None
+    address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = "Archetype (PFP)"
+    landlord_built_form = "Archetype (PFP)"
+    landlord_wall_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "Uprn"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # PFP North-West
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-West"
+    data_filename = "Places for People NORTH WEST - INSPECTIONS MASTER - UPDATE.xlsx"
+    sheet_name = "CHECKED"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "AddressLine1"
+    address1_method = None
+    address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = "Archetype (PFP)"
+    landlord_built_form = "Archetype (PFP)"
+    landlord_wall_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "Uprn"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # PFP North-East
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/North-East"
+    data_filename = "Places for People NORTH EAST - INSPECTIONS MASTER.xlsx"
+    sheet_name = "CHECKED"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "AddressLine1"
+    address1_method = None
+    address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = "Archetype (PFP)"
+    landlord_built_form = "Archetype (PFP)"
+    landlord_wall_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "Uprn"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # PFP East
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Places For People/East"
+    data_filename = "PFP EAST - Master - DN LN NG NR PE POSTCODES.xlsx"
+    sheet_name = "PFP EAST"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "AddressLine1"
+    address1_method = None
+    address_cols_to_concat = ["AddressLine1", "AddressLine2", "AddressLine3"]
+    missing_postcodes_method = None
+    landlord_year_built = None
+    landlord_os_uprn = None
+    landlord_property_type = "Archetype (PFP)"
+    landlord_built_form = "Archetype (PFP)"
+    landlord_wall_construction = None
+    landlord_heating_system = None
+    landlord_existing_pv = None
+    landlord_property_id = "Uprn"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    outcomes_id = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # Wates
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Wates - "
+    data_filename = "ECO 4 Wates.xlsx"
+    sheet_name = "Roadmap Homes"
+    postcode_column = 'Postcode'
+    fulladdress_column = None
+    address1_column = "Address Line 1"
+    address1_method = None
+    address_cols_to_concat = ["Address Line 1", "Address Line 2", "Address Line 3"]
+    missing_postcodes_method = None
+    landlord_year_built = "Build Year"
+    landlord_os_uprn = None
+    landlord_property_type = "Archetype"
+    landlord_built_form = "Archetype"
+    landlord_wall_construction = "Wall"
+    landlord_heating_system = "Heating Type"
+    landlord_existing_pv = None
+    landlord_property_id = "UPRN"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+
+    # Ealing
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Ealing/Programme data - 04032025"
+    # data_filename = "Ealing BC - Property Plus Tenure 25.02.2025.xlsx"
+    # sheet_name = "IGNORE - FULL MAIN"
+    # postcode_column = 'Postcode'
+    # fulladdress_column = "Address"
+    # address1_column = None
+    # address1_method = "first_word"
+    # address_cols_to_concat = []
+    # missing_postcodes_method = None
+    # landlord_year_built = "Year Built"
+    # landlord_os_uprn = None
+    # landlord_property_type = "Property Type Code"
+    # landlord_wall_construction = None
+    # landlord_heating_system = None
+    # landlord_existing_pv = None
+    # landlord_property_id = "Property ref"
+
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    # data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    # sheet_name = "Sheet1"
+    # postcode_column = 'Full Address.1'
+    # fulladdress_column = "Full Address"
+    # address1_column = None
+    # address1_method = "first_word"
+    # address_cols_to_concat = []
+    # missing_postcodes_method = None
+    # landlord_year_built = "Build Date"
+    # landlord_os_uprn = None
+    # landlord_property_type = "Property Type"
+    # landlord_wall_construction = "Wallinsul"
+    # landlord_heating_system = "HeatSorc"
+    # landlord_existing_pv = None
+    # landlord_property_id = "Property Reference"
+    # outcomes_filename = None
+    # outcomes_sheetname = None
+    # outcomes_postcode = None
+    # outcomes_houseno = None
+    # master_filepaths = []
+    # master_to_asset_list_filepath = None
+
+    # For Westward
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    data_filename = "WESTWARD - completed list - 20.03.2025.xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = "WFT EDIT Postcode"
+    fulladdress_column = "Address"
+    address1_column = None
+    address1_method = "house_number_extraction"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Build date"
+    landlord_os_uprn = "UPRN"
+    landlord_property_type = "Location type"
+    landlord_built_form = None
+    landlord_wall_construction = "Wall Construction (EPC)"
+    landlord_heating_system = "Heat Source"
+    landlord_existing_pv = "PV (Y/N)"
+    landlord_property_id = "Place ref"
+    outcomes_filename = None
+    outcomes_sheetname = None
+    outcomes_postcode = None
+    outcomes_houseno = None
+    master_filepaths = []
+    master_to_asset_list_filepath = None
+    outcomes_id = None
+
+    # For ACIS - programme re-build
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/ACIS/ACIS Full Programme Review March 2025"
+    # data_filename = "ACIS asset list.xlsx"
+    # sheet_name = "Assets"
+    # address1_column = "House No"
+    # postcode_column = "Postcode"
+    # landlord_property_id = "UPRN"
+    # fulladdress_column = None
+    # address_cols_to_concat = ["House No", "Street", "Town"]
+    # missing_postcodes_method = None
+    # address1_method = None
+    # landlord_year_built = "YEAR BUILT"
+    # landlord_os_uprn = None
+    # landlord_property_type = "Property type"
+    # landlord_built_form = None
+    # landlord_wall_construction = "Wall Constuction"
+    # landlord_heating_system = "Heating"
+    # landlord_existing_pv = None
+    # outcomes_filename = "ACIS Group - 25.11.2024 - outcomes.xlsx"
+    # outcomes_sheetname = "Feedback"
+    # outcomes_postcode = "Postcode"
+    # outcomes_houseno = "No"
+    # master_filepaths = [
+    # os.path.join(data_folder, "ECO 3 -Table 1.csv"),
+    # os.path.join(data_folder, "ECO 4 -Table 1.csv"),
+    # ]
+    # master_to_asset_list_filepath = None
+
+    # For plus dane
+    # This is the last active block, so these are the values actually used below.
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane"
+    data_filename = "PLUS DANE Asset List - for analysis.xlsx"
+    sheet_name = "Asset List"
+    address1_column = " Address"
+    postcode_column = " Postcode"
+    landlord_property_id = "UPRN"
+    fulladdress_column = " Address"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    address1_method = None
+    landlord_year_built = "Property Age"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Type"
+    landlord_wall_construction = "Landlord Wall Full"
+    landlord_heating_system = "Landlord Heating"
+    landlord_existing_pv = None
+    outcomes_filename = "plus dane outcomes.xlsx"
+    outcomes_sheetname = "EVERYTHING"
+    outcomes_postcode = "Post Code"
+    outcomes_houseno = "Numb."
+    master_filepaths = [
+        os.path.join(data_folder, "JJC Rolling Master.csv"),
+        os.path.join(data_folder, "SCIS Rolling Master.csv"),
+    ]
+    master_to_asset_list_filepath = os.path.join(data_folder, "surveys_to_assets.csv")
+
+    # Maps addresses to uprn in problematic cases
+    manual_uprn_map = {}
+
+    asset_list = AssetList(
+        local_filepath=os.path.join(data_folder, data_filename),
+        header=0,
+        sheet_name=sheet_name,
+        address1_colname=address1_column,
+        postcode_colname=postcode_column,
+        landlord_property_id=landlord_property_id,
+        full_address_colname=fulladdress_column,
+        full_address_cols_to_concat=address_cols_to_concat,
+        missing_postcodes_method=missing_postcodes_method,
+        address1_extraction_method=address1_method,
+        landlord_year_built=landlord_year_built,
+        landlord_uprn=landlord_os_uprn,
+        landlord_property_type=landlord_property_type,
+        landlord_built_form=landlord_built_form,
+        landlord_wall_construction=landlord_wall_construction,
+        landlord_roof_construction=landlord_roof_construction,
+        landlord_heating_system=landlord_heating_system,
+        landlord_existing_pv=landlord_existing_pv,
+        landlord_sap=landlord_sap,
+        phase=phase
+    )
+    asset_list.init_standardise()
+
+    # We produce the new maps, which can be saved for future usage.
+    # Each one keeps only the entries that are not already in the stored mapping for that
+    # attribute; an attribute with no landlord column configured yields an empty dict.
+    # NOTE(review): none of these dicts is read again inside this function.
+    new_property_type_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_property_type] if
+            asset_list.landlord_property_type else {}
+        ).items()
+        if k not in PROPERTY_MAPPING
+    }
+    new_built_form_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_built_form] if
+            asset_list.landlord_built_form else {}
+        ).items()
+        if k not in BUILT_FORM_MAPPINGS
+    }
+    new_wall_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_wall_construction] if
+            asset_list.landlord_wall_construction else {}
+        ).items()
+        if k not in WALL_CONSTRUCTION_MAPPINGS
+    }
+    new_heating_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_heating_system] if
+            asset_list.landlord_heating_system else {}
+        ).items()
+        if k not in HEATING_MAPPINGS
+    }
+    new_existing_pv_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_existing_pv] if asset_list.landlord_existing_pv else {}
+        ).items()
+        if k not in EXISTING_PV_MAPPINGS
+    }
+    new_roof_construction_map = {
+        k: v for k, v in (
+            asset_list.variable_mappings[asset_list.landlord_roof_construction] if
+            asset_list.landlord_roof_construction else {}
+        ).items()
+        if k not in ROOF_CONSTRUCTION_MAPPINGS
+    }
+
+    asset_list.apply_standardiation()
+
+    # We now flag properties that have been treated under existing programmes
+    asset_list.flag_outcomes(
+        outcomes_filepath=os.path.join(data_folder, outcomes_filename) if outcomes_filename else None,
+        outcomes_sheetname=outcomes_sheetname,
+        outcomes_address=outcomes_address,
+        outcomes_postcode=outcomes_postcode,
+        outcomes_houseno=outcomes_houseno,
+        outcomes_id=outcomes_id
+    )
+
+    asset_list.flag_survey_master(
+        master_filepaths=master_filepaths,
+        master_to_asset_list_filepath=master_to_asset_list_filepath
+    )
+
+    ### We retrieve the EPC data
+
+    # We chunk up this data into chunk_size rows at a time
+    # Create the chunks directory
+    epc_api_only = False
+    force_retrieve_data = False
+    skip = None  # Used to skip already completed chunks
+    chunk_size = 1000
+    filename = "Chunk {i}.csv"
+    download_folder = os.path.join(data_folder, "Chunks")
+    if not os.path.exists(download_folder):
+        os.makedirs(download_folder)
+
+    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
+    # Despite the name, this is the set of EXPECTED chunk file names, whether or not they exist yet
+    downloaded_files = {filename.format(i=i) for i in chunk_indexes}
+
+    # We check if we have files associated to these files already and if we do, and we do not want to force the
+    # fetching of the data, we skip
+    folder_contents = os.listdir(download_folder)
+    # NOTE(review): when every chunk file exists, the `any` branch below recomputes the same
+    # value, so this first branch is redundant for a non-empty asset list.
+    if all(x in folder_contents for x in downloaded_files):
+        skip = max(chunk_indexes)
+
+    # skip becomes the highest chunk index that already has a file; every chunk at or below it
+    # is skipped in the loop, including any lower-index chunk whose file is missing.
+    if any(x in folder_contents for x in downloaded_files):
+        skip = max([i for i in chunk_indexes if filename.format(i=i) in folder_contents])
+
+    for i in range(0, len(asset_list.standardised_asset_list), chunk_size):
+        print(f"Processing chunk {i} to {i + chunk_size}")
+        if skip is not None and not force_retrieve_data:
+            if i <= skip:
+                continue
+        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
+        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
+            df=chunk,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            uprn_column=AssetList.STANDARD_UPRN,
+            fulladdress_column=AssetList.STANDARD_FULL_ADDRESS,
+            address1_column=AssetList.STANDARD_ADDRESS_1,
+            postcode_column=AssetList.STANDARD_POSTCODE,
+            property_type_column=AssetList.STANDARD_PROPERTY_TYPE,
+            built_form_column=AssetList.STANDARD_BUILT_FORM,
+            manual_uprn_map=manual_uprn_map,
+            epc_api_only=epc_api_only,
+            epc_auth_token=EPC_AUTH_TOKEN
+        )
+
+        # We now retrieve any failed properties
+        # (a single retry; errors and no-data results from the retry are discarded)
+        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
+        epc_data_failed, _, _ = get_data(
+            df=chunk_failed,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            uprn_column=AssetList.STANDARD_UPRN,
+            fulladdress_column=AssetList.STANDARD_FULL_ADDRESS,
+            address1_column=AssetList.STANDARD_ADDRESS_1,
+            postcode_column=AssetList.STANDARD_POSTCODE,
+            property_type_column=AssetList.STANDARD_PROPERTY_TYPE,
+            built_form_column=AssetList.STANDARD_BUILT_FORM,
+            manual_uprn_map=manual_uprn_map,
+            epc_api_only=epc_api_only,
+            epc_auth_token=EPC_AUTH_TOKEN
+        )
+
+        epc_data_chunk.extend(epc_data_failed)
+
+        # Append the failed data to the main data
+        # Store the chunk locally as a csv
+        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(data_folder, f"Chunks/Chunk {i}.csv"), index=False)
+        # Store the errors and no-data locally
+        with open(os.path.join(data_folder, f"Chunks/Chunk {i} errors.json"), "w") as f:
+            json.dump(errors_chunk, f)
+
+        # NOTE(review): this file is named .csv but its content is written with json.dump
+        with open(os.path.join(data_folder, f"Chunks/Chunk {i} nodata.csv"), "w") as f:
+            json.dump(no_epc_chunk, f)
+
+    # We read in and concatenate the created chunks
+    # List the contents
+    epc_data = []
+    for file in downloaded_files:
+        csv_data = pd.read_csv(os.path.join(download_folder, file))
+        # We need to convert the recommendations back to a list
+        # NOTE(review): eval executes whatever expression is stored in the CSV cell. If these
+        # cells only ever hold Python literals, ast.literal_eval would be the safer parser — confirm.
+        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
+        # We don't have this if we didn't run the pulling from find my epc
+        if "find_my_epc_data" in csv_data.columns:
+            csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
+        epc_data.append(csv_data)
+
+    epc_df = pd.concat(epc_data)
+    epc_df["estimated"] = epc_df["estimated"].fillna(False)
+
+    # We expand out the recommendations
+    recommendations_df = epc_df[[asset_list.DOMNA_PROPERTY_ID, "recommendations"]]
+
+    # First pass: collect every distinct recommendation summary text
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    # Second pass: one row per property with a True/False column per recommendation text
+    columns = [asset_list.DOMNA_PROPERTY_ID] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data[asset_list.DOMNA_PROPERTY_ID] = row[asset_list.DOMNA_PROPERTY_ID]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # NOTE(review): this selection raises KeyError if any of the three floor-insulation texts
+    # never appeared in the retrieved recommendations.
+    transformed_df = transformed_df[
+        [
+            asset_list.DOMNA_PROPERTY_ID, "Floor insulation (solid floor)",
+            "Floor insulation", "Floor insulation (suspended floor)"
+        ]
+    ]
+
+    # True when the EPC carries any of the three floor-insulation recommendations
+    transformed_df["epc_has_floor_recommendation"] = (
+        transformed_df["Floor insulation (solid floor)"] | transformed_df["Floor insulation"] |
+        transformed_df["Floor insulation (suspended floor)"]
+    )
+
+    # Get the find my epc data
+    if "find_my_epc_data" not in epc_df.columns:
+        epc_df["find_my_epc_data"] = None
+
+    # Flatten each row's find_my_epc_data dict into columns next to the property id;
+    # rows without it contribute only the id.
+    find_my_epc_data = []
+    for _, x in epc_df.iterrows():
+        if x["find_my_epc_data"]:
+            find_my_epc_data.append(
+                {
+                    asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID],
+                    **x["find_my_epc_data"]
+                }
+            )
+        else:
+            find_my_epc_data.append(
+                {
+                    asset_list.DOMNA_PROPERTY_ID: x[asset_list.DOMNA_PROPERTY_ID]
+                }
+            )
+
+    find_my_epc_data = pd.DataFrame(find_my_epc_data)
+
+    find_my_epc_data = find_my_epc_data.merge(
+        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
+        how="left", on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    # We check if we get the solar pv column:
+    if "Solar photovoltaics" not in find_my_epc_data.columns:
+        find_my_epc_data["Solar photovoltaics"] = False
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
+    ].rename(
+        columns=asset_list.EPC_API_DATA_NAMES
+    )
+
+    # Look for columns not in the find my EPC data, which will have happened if we didn't
+    # retrieve it in the first place
+    missed_find_epc_cols = [c for c in list(asset_list.FIND_EPC_DATA_NAMES.keys()) if c not in find_my_epc_data.columns]
+    if missed_find_epc_cols:
+        for c in missed_find_epc_cols:
+            find_my_epc_data[c] = None
+
+    epc_df = epc_df.merge(
+        find_my_epc_data[
+            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"] + list(asset_list.FIND_EPC_DATA_NAMES.keys())
+        ]
+        .rename(columns=asset_list.FIND_EPC_DATA_NAMES),
+        how="left",
+        on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    asset_list.merge_data(epc_df)
+
+    asset_list.extract_attributes()
+
+    # The S3 object is named .bson but is decoded with msgpack
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    asset_list.identify_worktypes(cleaned)
+
+    pprint(asset_list.work_type_figures)
+
+    asset_list.flat_analysis()
+
+    ################################################################
+    # WESTWARD - comparison between Kieran's method & automated
+    ################################################################
+    # NOTE(review): data_folder was last set by the Plus Dane block above, so the Westward
+    # workbook below is looked up in the Plus Dane folder as the code stands.
+
+    # Check 1)
+    cavity_fills = pd.read_excel(
+        os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"),
+        sheet_name="Straight Fill"
+    )
+    cavity_fills = cavity_fills.merge(
+        asset_list.standardised_asset_list[
+            [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"]
+        ],
+        how="left",
+        left_on=asset_list.landlord_property_id,
+        right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID
+    )
+    cavity_fills["cavity_reason"] = cavity_fills["cavity_reason"].fillna("Not identified")
+    print(cavity_fills["cavity_reason"].value_counts())
+    # Didn't identify 3 properties because they're bedsits
+    # 4 properties were identified, not based on the non-intrusives but instead because
+    # Westward said they were built in 2003/2007. Have adjusted this to use the age from the
+    # epc as well, as EPC says 1975 and they look like 1975 properties
+    # 37 properties flagged as already having solar - these are all because the landlord said they have solar
+    # e.g.
+    # https://earth.google.com/web/search/11+Winsland+Avenue+TOTNES+TQ9+5FT/@50.43354465,-3.71318276,46.57468503a,
+    # 59.14004365d,35y,0h,0t,
+    # 0r/data=CpABGmISXAolMHg0ODZkMWQxOGE4NWRiZjdkOjB4YjBhM2E5M2Q3YWVlMWEwYhlZYgp7fzdJQCHFfC9027QNwCohMTEgV2luc2xhbmQgQXZlbnVlIFRPVE5FUyBUUTkgNUZUGAIgASImCiQJbxsQEoo3SUARXQcp_HE3SUAZBmiZGJ6yDcAhCA0fqq63DcBCAggBOgMKATBCAggASg0I____________ARAA
+    # https://earth.google.com/web/search/15+St+Anne%27s+Ct,+Newton+Abbot+TQ12+1TL/@50.53068337,-3.61611128,
+    # 11.74908956a,135.73212429d,35y,0h,0t,
+    # 0r/data=CpUBGmcSYQolMHg0ODZkMDVkMjFhODhjZjgxOjB4MjBmMzE2Zjc3MGI2NGMwYxlCxHLw8UNJQCFZqyzALe4MwComMTUgU3QgQW5uZSdzIEN0LCBOZXd0b24gQWJib3QgVFExMiAxVEwYAiABIiYKJAm-r6U2iDdJQBHS5ICRdDdJQBmYGVpmiLINwCG8wcrtqbYNwEICCAE6AwoBMEICCABKDQj___________8BEAA
+
+    # Check 2)
+    cavity_fills_with_solar = pd.read_excel(
+        os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"),
+        sheet_name="Solar PV - Straight Fill"
+    )
+    cavity_fills_with_solar = cavity_fills_with_solar.merge(
+        asset_list.standardised_asset_list[
+            [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason"]
+        ],
+        how="left",
+        left_on=asset_list.landlord_property_id,
+        right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID
+    )
+    cavity_fills_with_solar["cavity_reason"] = cavity_fills_with_solar["cavity_reason"].fillna("Not identified")
+    print(cavity_fills_with_solar["cavity_reason"].value_counts())
+    # 203 properties total
+    # 140 properties were flagged up based on non-intrusives (Non-Intrusive Data Showed Empty Cavity)
+    # 63 property already has solar
+
+    # Check 3) RDF
+    rdf = pd.read_excel(
+        os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"),
+        sheet_name="RDF CIGA checks"
+    )
+    rdf = rdf.merge(
+        asset_list.standardised_asset_list[
+            [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"]
+        ],
+        how="left",
+        left_on=asset_list.landlord_property_id,
+        right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID
+    )
+    rdf["cavity_reason"] = rdf["cavity_reason"].fillna("Not identified")
+    print(rdf["cavity_reason"].value_counts())
+    # 264 properties are not identified, 261 of which are due to the fact they contain materials
+    # The other 3 were determined to be eligible for solar instead
+    # Many of these units that were identified for rdf works could be solar jobs
+
+    rdf_with_solar = pd.read_excel(
+        os.path.join(data_folder, "WESTWARD - Route March Prep.xlsx"),
+        sheet_name="Solar PV - RDF CIGA Checks"
+    )
+    rdf_with_solar = rdf_with_solar.merge(
+        asset_list.standardised_asset_list[
+            [asset_list.STANDARD_LANDLORD_PROPERTY_ID, "cavity_reason", "solar_reason"]
+        ],
+        how="left",
+        left_on=asset_list.landlord_property_id,
+        right_on=asset_list.STANDARD_LANDLORD_PROPERTY_ID
+    )
+    rdf_with_solar["cavity_reason"] = rdf_with_solar["cavity_reason"].fillna("Not identified")
+    # NOTE(review): unlike the checks above, this result is not printed, so it is discarded
+    # when run as a script.
+    rdf_with_solar["cavity_reason"].value_counts()
+
+    # All others identified - some flagged as empties due to EPC or landlord data suggesting as much
+    # 5 not identified due to containing COMPACTED BEAD
+
+    # NOTE(review): this indexes the frame with the property-id column's values rather than a
+    # boolean mask — looks like an unfinished filter; confirm the intended condition.
+    asset_list.standardised_asset_list = asset_list.standardised_asset_list[
+        asset_list.standardised_asset_list[asset_list.landlord_property_id]
+    ]
+
+    asset_list.load_contact_details(
+        local_filepath=os.path.join(data_folder, "Full property list wth D&V report V look up 12.2.25.xlsx"),
+        sheet_name="Report 1",
+        landlord_property_id=asset_list.landlord_property_id,
+        phone_number_column='Property Current Tel. Number',
+        fullname_column='Proeprty Current Occupant',
+        firstname_column=None,
+        lastname_column=None,
+        email_column=None,  # TODO - we need this
+    )
+
+    # Convert to a format suitable for CRM
+    # TODO: TEMP
+    assigned_surveyors = pd.DataFrame(
+        [
+            {
+                asset_list.landlord_property_id: "02610001",
+                "week_commencing": "10/10/2025",
+                "surveyor_name": "Khalim Conn-Kowlessar",
+                "surveyor_email": "khalim@domna.homes",
+            }
+        ]
+    )
+
+    # TODO: Sort the output by postcode
+
+    # NOTE(review): hard-coded to Ealing's domain regardless of which customer block is active
+    company_domain = "ealing.gov.uk"
+    crm_pipeline_name = "Survey Management"
+    first_dealstage = "READY TO BEGIN SCHEDULING"
+    # TODO - temp, upload to either SharePoint or AWS
+
+    asset_list.prepare_for_crm(
+        assigned_surveyors=assigned_surveyors,
+        company_domain=company_domain,
+        crm_pipeline_name=crm_pipeline_name,
+        first_dealstage=first_dealstage
+    )
+    hubspot_data = asset_list.hubspot_data
+
+    # Store as an excel
+    # Output name is the input file name with its extension replaced by " - Standardised.xlsx"
+    # (this reuses the `filename` variable that held the chunk name template earlier)
+    filename = os.path.join(data_folder, ".".join(data_filename.split(".")[:-1])) + " - Standardised.xlsx"
+    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
+
+    with pd.ExcelWriter(filename) as writer:
+        asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
+        asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
+        # If we have outcomes, we add a tab with the outcomes
+        if not asset_list.outcomes_for_output.empty:
+            asset_list.outcomes_for_output.to_excel(writer, sheet_name="Outcomes", index=False)
+
+        if not asset_list.unmatched_submissions.empty:
+            asset_list.unmatched_submissions.to_excel(writer, sheet_name="Unmatched Submissions", index=False)
+
+        if not asset_list.outcomes_no_match.empty:
+            asset_list.outcomes_no_match.to_excel(writer, sheet_name="Unmatched Outcomes", index=False)
+
+    # Store the Hubspot export as a csv
+    hubspot_data.to_csv(os.path.join(data_folder, "Hubspot Export.csv"), index=False)
diff --git a/asset_list/mappings/built_form.py b/asset_list/mappings/built_form.py
new file mode 100644
index 00000000..e103f794
--- /dev/null
+++ b/asset_list/mappings/built_form.py
@@ -0,0 +1,148 @@
+import numpy as np
+
+STANDARD_BUILT_FORMS = {
+    # Houses
+    "detached", "semi-detached", "end-terrace", "mid-terrace",
+    # Flats
+    "basement", "ground floor", "mid-floor", "top-floor",
+    "unknown",  # catch-all label; BUILT_FORM_MAPPINGS uses it for missing / unclassifiable inputs
+}
+
+BUILT_FORM_MAPPINGS = {  # exact-match (case- and whitespace-sensitive) raw built-form text -> STANDARD_BUILT_FORMS label
+    'House (End Terrace)': 'end-terrace',
+    'Ground Floor Flat General': 'ground floor',
+    'House (Semi)': 'semi-detached',
+    'House (Mid Terrace)': 'mid-terrace',
+    'Bungalow': 'unknown',
+    'House (Mid terrace)': 'mid-terrace',
+    'Maisonette': 'unknown',
+    'Flat': 'unknown',
+    'First Floor Flat General': 'mid-floor',
+    'Bungalow (Semi)': 'semi-detached',
+
+    'Detached House': 'detached',
+    'End Terraced House': 'end-terrace',
+    'Studio (Ground floor)': 'ground floor',
+    'Mid Terraced House': 'mid-terrace',
+    'Ground Floor Flat': 'ground floor',
+    'Semi Detached House': 'semi-detached',
+    'Detached Property': 'detached',
+    'Level not confirmed': 'unknown',
+    'Bedsit': 'unknown',
+    'Cottage': 'detached',
+    'Terraced House': 'mid-terrace',
+    'Studio (1st Floor)': 'ground floor',
+    'Standard Maisonette': 'unknown',
+    'Third Floor Flat or Above': 'top-floor',
+    'Town House': 'end-terrace',
+    'Guest room in a complex': 'unknown',
+    'Back To Back House': 'mid-terrace',
+    'PIMSS EMPTY': 'unknown',
+    'Flat Basement': 'basement',
+    'House': 'unknown',
+    'Second Floor Flat': 'mid-floor',
+    'First Floor Flat': 'ground floor',  # NOTE(review): 'First Floor Flat General' above maps to 'mid-floor' - confirm which is intended
+    'Room Only': 'unknown',
+
+    'End Terrace Housex': 'end-terrace',
+    'Mid Terrace Bungalow': 'mid-terrace',
+    'End Terrace Bungalow': 'end-terrace',
+    'Mid Terrace House': 'mid-terrace',
+    'Detached Bungalow': 'detached',
+    'End Terrace House': 'end-terrace',
+    'Mid Terrace Housekeeping ': 'mid-terrace',  # the trailing space is part of the key
+    'Semi Detached Bung': 'semi-detached',
+    'Guest Room': 'unknown',
+    'Coach House': 'detached',
+    'Office Buildings': 'unknown',
+    'Maisonnette': 'mid-floor',
+    'Bedspace': 'unknown',
+    'Studio (3rd floor and above)': 'top-floor',
+    'Adapted Property For Disabled': 'unknown',
+    'Studio (2nd floor)': 'mid-floor',
+    np.nan: 'unknown',  # only the np.nan object itself hits this key: NaN != NaN, so dict lookup falls back to identity
+    'Third Floor Flat': 'mid-floor',
+    '2 Ext. Wall Flat': 'mid-terrace',
+    'Hostel': 'unknown',
+    'Flat: Mid Terrace: Mid Floor': 'mid-terrace',
+    'Bungalow: SemiDetached': 'semi-detached',
+    'Flat: End Terrace: Top Floor': 'end-terrace',
+    'Flat: Enclosed End Terrace: Top Floor': 'end-terrace',
+    'Maisonette: End Terrace: Ground Floor': 'end-terrace',
+    'Flat: End Terrace: Ground Floor': 'end-terrace',
+    'Flat: Mid Terrace: Top Floor': 'mid-terrace',
+    'House: Detached': 'detached',
+    'Flat: End Terrace: Mid Floor': 'end-terrace',
+    'House: SemiDetached': 'semi-detached',
+    'Flat: Semi Detached: Ground Floor': 'semi-detached',
+    'Flat: Semi Detached: Top Floor': 'semi-detached',
+    'Flat: Mid Terrace: Ground Floor': 'mid-terrace',
+    'House: MidTerrace': 'mid-terrace',
+    'House: EndTerrace': 'end-terrace',
+    'Bungalow: EndTerrace': 'end-terrace',
+    'Bungalow: MidTerrace': 'mid-terrace',
+    'Flat: Semi Detached: Mid Floor': 'semi-detached',
+    'Maisonette: Mid Terrace: Top Floor': 'mid-terrace',
+    'Flat: Enclosed Mid Terrace: Mid Floor': 'mid-terrace',
+    'Flat: Enclosed Mid Terrace: Ground Floor': 'mid-terrace',
+    'Flat: Detached: Ground Floor': 'detached',
+    'Flat: Detached: Mid Floor': 'detached',
+    'Flat: Detached: Top Floor': 'detached',
+    'Flat: Enclosed End Terrace: Mid Floor': 'end-terrace',
+    'Bungalow: Detached': 'detached',
+    'Maisonette: End Terrace: Mid Floor': 'end-terrace',
+    'Maisonette: Detached: Top Floor': 'detached',
+    'Flat: Enclosed End Terrace: Ground Floor': 'end-terrace',
+    'Flat: Enclosed Mid Terrace: Top Floor': 'mid-terrace',
+    'House: EnclosedEndTerrace': 'end-terrace',
+    '3 Ext. Wall Flat': 'semi-detached',
+    'Bungalow Detached': 'detached',
+    'Bungalow End Terrace': 'end-terrace',
+    'Bungalow Mid Terrace': 'mid-terrace',
+    'Bungalow Semi Detached': 'semi-detached',  # same label as 'Bungalow (Semi)' / 'Bungalow: SemiDetached' / 'Semi Detached Bung'
+    'Maisonette 2 Ext. Wall': 'mid-terrace',
+    'Maisonette 3 Ext. Wall': 'semi-detached',
+    'End-terrace': 'end-terrace',
+    'Mid-terrace': 'mid-terrace',
+    'Semi-detached': 'semi-detached',
+    'Detached': 'detached',
+    'Flat / maisonette': 'unknown',
+    '2014 onwards': 'unknown',
+
+    'Semi Detached': 'semi-detached',
+    'End Terraced': 'end-terrace',
+    'Basement': 'basement',
+    'No': 'unknown',
+    'Mid Terrace': 'mid-terrace',
+    'Link Detached': 'detached',
+    'Mid Terraced': 'mid-terrace',
+    'Ground Floor': 'ground floor',
+    'End Terrace': 'end-terrace',
+    'Sheltrd Semi Det': 'semi-detached',
+    'Shop': 'unknown',
+    'Fourth Floor': 'mid-floor',
+    'Terraced': 'mid-terrace',
+    'Leasehold Terr': 'mid-terrace',
+    'Room': 'unknown',
+    'Second Floor': 'mid-floor',
+    'Third Floor': 'mid-floor',
+    'Office': 'unknown',
+    'First Floor Over Arch': 'ground floor',
+    '16-25 IND-PPL': 'unknown',
+    'Seventh Floor': 'top-floor',
+    'Sheltered': 'unknown',
+    'Shelt Bung End': 'end-terrace',
+    'Room In Shared Accommodation': 'unknown',
+    'Sheltred Bung Terrace': 'mid-terrace',
+    'Garage In Block': 'unknown',
+    'First Floor': 'ground floor',
+    'First Floor Over Garage': 'ground floor',
+    'Leasehold': 'unknown',
+    'Sheltred Bung': 'unknown',
+    'Garage': 'unknown',
+    'Sixth Floor': 'top-floor',
+    'Sheltered Bung': 'semi-detached',
+    'Guest': 'unknown',
+    'Fifth Floor': 'mid-floor'
+
+}
diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py
new file mode 100644
index 00000000..51f5f922
--- /dev/null
+++ b/asset_list/mappings/exising_pv.py
@@ -0,0 +1,20 @@
+import numpy as np
+
+STANDARD_EXISTING_PV = {
+    "unknown", "no PV", "already has PV",  # the three labels EXISTING_PV_MAPPINGS maps onto
+}
+
+EXISTING_PV_MAPPINGS = {  # exact-match raw existing-PV value -> one of the STANDARD_EXISTING_PV labels
+    "NO": "no PV",
+    "YES": "already has PV",
+    "no": "no PV",
+    "yes": "already has PV",
+    True: "already has PV",  # bool key: 1 and 1.0 compare/hash equal to True, so they hit this entry too
+    False: "no PV",  # likewise 0 and 0.0
+    np.nan: "unknown",  # matched by object identity only, because NaN != NaN
+    "PV: 2kWp array": "already has PV",
+    "PV: 25% roof area, PV: 3.6kWp array": "already has PV",
+    "PV: 10% roof area, PV: 2kWp array": "already has PV",
+    "PV: 50% roof area": "already has PV",
+    "Solar PV": "already has PV",
+}
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
new file mode 100644
index 00000000..7f2f81f2
--- /dev/null
+++ b/asset_list/mappings/heating_systems.py
@@ -0,0 +1,206 @@
+import numpy as np
+
+STANDARD_HEATING_SYSTEMS = {
+    # Boilers
+    "gas combi boiler",
+    "gas condensing boiler",
+    "gas condensing combi",
+    "communal gas boiler",
+    "oil boiler",
+    "electric boiler",
+    "boiler - other fuel",
+    # Heat pumps and heat networks
+    "air source heat pump",
+    "ground source heat pump",
+    "district heating",
+    # Electric heating
+    "electric storage heaters",
+    "high heat retention storage heaters",
+    "electric radiators",
+    "electric ceiling",
+    "electric underfloor",
+    # Room heaters
+    "room heaters",
+    # Fuel-only labels
+    "electric fuel", "oil fuel", "solid fuel",
+    # Catch-alls
+    "no heating", "other", "unknown",
+}
+
+HEATING_MAPPINGS = {  # exact-match (case- and whitespace-sensitive) raw heating text -> STANDARD_HEATING_SYSTEMS label
+    "Combi - GAS": "gas combi boiler",
+    "E7 Storage Heaters": "high heat retention storage heaters",
+    "District heating system": "district heating",
+    "Condensing Boiler - GAS": "gas condensing boiler",
+    "Boiler Oil/other": "oil boiler",
+    "Condensing Combi - Gas": "gas condensing combi",
+    "Air Source Source Heat Pump": "air source heat pump",
+    "Biomass Boiler": "boiler - other fuel",
+    "Ground Source Heat Pump": "ground source heat pump",
+    "Electric Oil filled radiators": "electric radiators",
+    "Solid Fuel": "other",
+    "LPG Boiler": "boiler - other fuel",
+    "Electric Boiler": "electric boiler",
+    "No data": "unknown",
+    "Boiler Communal/Commercial - GAS": "communal gas boiler",
+    "Eco Electric Radiators": "electric radiators",
+    "Gas fire": "other",
+    "Backboiler - Solid fuel": "other",
+    'combi - gas': 'gas combi boiler',
+    'e7 storage heaters': 'high heat retention storage heaters',
+    'district heating system': 'district heating',
+    'condensing boiler - gas': 'gas condensing boiler',
+    'boiler oil/other': 'oil boiler',
+    'condensing combi - gas': 'gas condensing combi',
+    'air source source heat pump': 'air source heat pump',
+    'biomass boiler': 'boiler - other fuel',
+    'ground source heat pump': 'ground source heat pump',
+    'electric oil filled radiators': 'electric radiators',
+    'solid fuel': 'other',
+    'lpg boiler': 'boiler - other fuel',
+    'electric boiler': 'electric boiler',
+    'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler',
+    'eco electric radiators': 'electric radiators',
+    'gas fire': 'other', 'backboiler - solid fuel': 'other',
+    'ASHP': 'air source heat pump',
+    'COMMHEAT': 'communal gas boiler',
+    'GBB': 'gas combi boiler',
+    'GFS': 'gas condensing boiler',
+    'GWA': 'gas condensing boiler',
+    'GWM': 'gas condensing combi',
+    'HDU': 'district heating',
+    'OILBLR': 'oil boiler',
+    'SOLIDFUEL': 'boiler - other fuel',
+    'STORHTR': 'electric storage heaters',
+    np.nan: 'unknown',  # only the np.nan object itself hits this key: NaN != NaN, so dict lookup falls back to identity
+    'Oil': 'boiler - other fuel',
+    'Gas': 'gas condensing boiler',
+    'Electric': 'electric storage heaters',
+    'Solid fuel': 'other',
+    'No Heat': 'unknown',
+    'GSHP': 'ground source heat pump',
+
+    'Boiler Oil': 'oil boiler',
+    'Boiler Electricity': 'electric boiler',
+    'Boiler ND': 'unknown',
+    'ND Mains gas': 'unknown',
+    'Room heaters Mains gas': "room heaters",
+    'Heat pump (air) Electricity': 'air source heat pump',
+    'Room heaters Electricity': 'electric radiators',
+    'Room heaters Oil': 'room heaters',
+    'No heating system ND': 'no heating',
+    'Heat pump (wet) Electricity': 'ground source heat pump',
+    'Room heaters Biomass': 'room heaters',
+    'ND Solid fuel': 'unknown',
+    'Boiler Mains gas': 'gas combi boiler',
+    'Boiler LPG': 'boiler - other fuel',
+    'Room heaters Solid fuel': 'room heaters',
+    'ND ND': 'unknown',
+    'Storage heating Electricity': 'electric storage heaters',
+    'ND Electricity': 'unknown',
+    'Community heating Community (non-gas)': 'district heating',
+    'No heating system N/A': 'no heating',
+    'Boiler Solid fuel': 'boiler - other fuel',
+    'Community heating Community (mains gas)': 'communal gas boiler',
+    'Boiler Biomass': 'boiler - other fuel',
+    'No heating system Mains gas': 'no heating',
+
+    'Storage heaters': 'electric storage heaters',
+    'Air Source': 'air source heat pump',
+    'Ground source': 'ground source heat pump',
+    'OIl': 'boiler - other fuel',
+    'Quantum storage heaters (old sh on EPC)': 'high heat retention storage heaters',
+    'Quanum Storage heaters': 'high heat retention storage heaters',
+    'Quantum storage heaters (Old SH on EPC)': 'high heat retention storage heaters',
+    'Quantum storage heaters': 'high heat retention storage heaters',
+    'Air Source (EPC says SH)': 'air source heat pump',
+    'ASHP - Was logged as oil': 'air source heat pump',
+    'Ground Source': 'ground source heat pump',
+    'District Heating': 'district heating',
+    'Mains Gas (Communal)': 'communal gas boiler',
+    'LPG': 'boiler - other fuel',
+    'Mains Gas': 'gas condensing boiler',
+    'ELECTRIC': 'electric fuel',
+    'OIL': 'oil fuel',
+    'SOLID FUEL': 'solid fuel',
+    'GAS': 'gas combi boiler',
+    'DO NOT SURVEY': 'unknown',
+    'Gas Boiler': 'gas combi boiler',
+    'Communal Gas ': 'communal gas boiler',
+    'Communal': 'communal gas boiler',
+    'Communal Gas': 'communal gas boiler',
+    'Wood Burning Boiler': "boiler - other fuel",
+    'Oil Fired Boiler': 'oil boiler',
+    'Electric (direct acting) room heaters: Panel, convector or radiant heaters Electricity: Electricity':
+        'room heaters',
+    'Electric Storage Systems: Integrated storage+direct-acting heater Electricity: Electricity':
+        'electric storage heaters',
+    'Community Heating Systems: Community CHP and boilers (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler',
+    'Boiler: D rated Regular Boiler Gas: Mains Gas': 'gas condensing boiler',  # 'gas boiler' is not a standard label; use the one given to 'Boiler: E rated Regular Boiler'
+    'Boiler: C rated Combi Gas: Mains Gas': 'gas combi boiler',
+    'Electric Storage Systems: Fan storage heaters Electricity: Electricity': 'electric storage heaters',
+    ' ': 'unknown',
+    'Boiler: G rated Regular Boiler Gas: Mains Gas': 'gas condensing boiler',  # see D-rated entry above
+    'Electric Storage Systems: Modern (slimline) storage heaters Electricity: Electricity': 'electric storage heaters',
+    'Boiler: E rated Regular Boiler Gas: Mains Gas': 'gas condensing boiler',  # agrees with the un-suffixed 'Boiler: E rated Regular Boiler' key
+    'Boiler: A rated Regular Boiler Electricity: Electricity': 'electric boiler',
+    'Community Heating Systems: Community boilers only (RdSAP) Gas: Mains Gas (Community)': 'communal gas boiler',
+    'Boiler: A rated Combi Gas: Mains Gas': 'gas condensing combi',
+    'Boiler: A rated CPSU Electricity: Electricity': 'electric boiler',
+    'Heat Pump: Electric Heat pumps: Ground source heat pump with flow temperature <= 35°C': 'ground source heat pump',
+    'Heat Pump: Electric Heat pumps: Ground source heat pump in other cases': 'ground source heat pump',
+    'Electric Storage Systems: High heat retention storage heaters': 'high heat retention storage heaters',
+    'Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C': 'air source heat pump',
+    'Electric (direct acting) room heaters: Panel, convector or radiant heaters': 'room heaters',
+    'Boiler: C rated Combi': 'gas combi boiler',
+    'Boiler: B rated Regular Boiler': 'gas condensing boiler',
+    'Boiler: E rated Combi': 'gas combi boiler',
+    'Boiler: A rated Combi': 'gas combi boiler',
+    'Boiler: E rated Regular Boiler': 'gas condensing boiler',
+    'Community Heating Systems: Community boilers only (RdSAP)': 'district heating',
+    'Boiler: C rated Regular Boiler': 'gas condensing boiler',
+    'Boiler: A rated Regular Boiler': 'gas condensing boiler',
+    'Electric Storage Systems: Fan storage heaters': 'electric storage heaters',
+    'Boiler: F rated Combi': 'gas combi boiler',
+
+    'Room heaters': 'room heaters',
+    'Room Heaters': 'room heaters',
+    'Boiler': 'gas condensing boiler',
+    'Heat Pump (Wet)': 'air source heat pump',  # NOTE(review): 'Heat pump (wet) Electricity' above maps to ground source - confirm
+    'Community Heating': 'district heating',
+    'Heat pump (wet)': 'air source heat pump',
+    'Electric ceiling heating': 'electric ceiling',
+    'Electric under floor heating': 'electric underfloor',
+    'Community heating': 'district heating',
+
+    'Wet - Radiators Air Source Heat Pump': 'air source heat pump',
+    'Wet - Radiators Electric': 'electric boiler',
+    'Storage Heaters': 'high heat retention storage heaters',
+    'Wet - Radiators Oil': 'oil boiler',
+    'Communal Wet - Radiators Gas': 'communal gas boiler',
+    'Electric - Storage/Panel Heaters Electric': 'electric storage heaters',
+    'Gas Central Heating': 'gas combi boiler',
+    'Wet - Radiators Solar': 'other',
+    'Electric - Storage/Panel Heaters LPG': 'electric storage heaters',
+    'No Heating Solid': 'no heating',
+    'Wet - Underfloor Gas': 'gas condensing boiler',
+    'No Heating Electric': 'no heating',
+    'Oil Fired Central Heating': 'oil boiler',
+    'Warm Air Gas': 'other',
+    'Communal Boilers': 'communal gas boiler',
+    'Wet - Radiators Gas': 'gas combi boiler',
+    'Wet - Radiators Solid': 'solid fuel',
+    'Wet - Radiators LPG': 'other',
+    'No Heating Gas': 'no heating',
+    'No Heating': 'no heating',
+    'Panel Heaters': 'electric radiators',
+    'Rointe Electric Heating': 'electric storage heaters',
+    'Underfloor Heating': 'electric underfloor',
+    'Air Source Heating': 'air source heat pump',
+    'Warm Air Electric': 'other',
+    'Communal Wet - Radiators Electric': 'communal gas boiler',
+    'Wet - Underfloor Solar': 'other',
+    'No Heating Required Gas': 'unknown',
+    'Electric - Storage/Panel Heaters Gas': 'electric storage heaters',
+    'Electric - Storage/Panel Heaters Solid': 'electric storage heaters'
+}
diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py
new file mode 100644
index 00000000..dc8dbf21
--- /dev/null
+++ b/asset_list/mappings/property_type.py
@@ -0,0 +1,182 @@
+import numpy as np
+
+# Canonical property-type labels; every value in PROPERTY_MAPPING below is one of these
+STANDARD_PROPERTY_TYPES = {
+    "house", "bungalow", "flat", "maisonette", "bedsit", "block of flats", "block house", "coach house",
+    "park home", "other", "unknown",
+}
+
+# Basic exact-match (case- and whitespace-sensitive) mapping from commonly seen raw property-type values to the standard values above
+PROPERTY_MAPPING = {
+ "HOUSE": "house",
+ "FLAT": "flat",
+ "MAISONET": "maisonette",
+ "BUNGALOW": "bungalow",
+ "BLKHOUS": "block house",
+ "blkhous": "block house",
+ "BEDSIT": "bedsit",
+ "COACHSE": "coach house",
+ "coachse": "coach house",
+ 'Admin Unit Type': 'unknown',
+ 'Block': 'block of flats',
+ 'Bungalow': 'bungalow',
+ 'Flat': 'flat',
+ 'House': 'house',
+ 'Maisonette': 'maisonette',
+ 'Stairwell': 'other',
+ 'MAISON': 'maisonette',
+ '3 Bed Semi Detached House': 'house',
+ '3 Bed Mid Terrace House': 'house',
+ '2 Bed Semi Detached House': 'house',
+ '4 Bed Semi Detached House': 'house',
+ '2 Bed End Terrace House': 'house',
+ '1 Bed Sheltered Bungalow': 'bungalow',
+ '1 Bed 1st Floor Sheltered Flat': 'flat',
+ '2 Bed Second Floor Flat': 'flat',
+ '1 Bed Mid Terrace House': 'house',
+ '1 Bed End Terrace House': 'house',
+ '7 Bed Detached House': 'house',
+ '4 Bed End Terrace House': 'house',
+ '1 Bed Link House': 'house',
+ '1 Bed Second Floor Flat': 'flat',
+ '2 Bed Detached House': 'house',
+ '1 Bed Ground Floor Flat': 'flat',
+ '2 Bed Sheltered Bungalow': 'bungalow',
+ '4 Bed Mid Terrace House': 'house',
+ '2 Bed Mid Terrace House': 'house',
+ '2 Bed First Floor Flat': 'flat',
+ '3 Bed Detached House': 'house',
+ 'Ground Floor Bedsit': 'bedsit',
+ '3 Bed Bungalow': 'bungalow',
+ np.nan: 'unknown',  # only the np.nan object itself hits this key: NaN != NaN, so dict lookup falls back to identity
+ '5 Bed End Terrace House': 'house',
+ '1 Bed Grd Floor Sheltered Flat': 'flat',
+ '3 Bed End Terrace House': 'house',
+ '2 Bed Second Floor Maisonette': 'maisonette',
+ '2 Bed Ground Floor Flat': 'flat',
+ '2 Bed First Floor Maisonette': 'maisonette',
+ '4 Bed Detached House': 'house',
+ '1 Bed Bungalow': 'bungalow',
+ '2 Bed Bungalow': 'bungalow',
+ 'First Floor Bedsit': 'bedsit',
+ '3 Bed First Floor Maisonette': 'maisonette',
+ '2 Bed 1st Floor Sheltered Flat': 'flat',
+ '1 Bed First Floor Flat': 'flat',
+ '3 Bed First Floor Flat': 'flat',
+ 'ND': 'unknown',
+ 'House (Mid Terrace)': 'house',
+ 'First Floor Flat General': 'flat',
+ 'House (End Terrace)': 'house',
+ 'House (Mid terrace)': 'house',
+ 'Bungalow (Semi)': 'bungalow',
+ 'Ground Floor Flat General': 'flat',
+ 'House (Semi)': 'house',
+ 'Detached House': 'house',
+ 'Bedsit': 'bedsit',
+ 'Terraced House': 'house',
+ 'Standard Maisonette': 'maisonette',
+ 'End Terraced House': 'house',
+ 'Third Floor Flat or Above': 'flat',
+ 'Town House': 'house',
+ 'Mid Terraced House': 'house',
+ 'Back To Back House': 'house',
+ 'Flat Basement': 'flat',
+ 'Ground Floor Flat': 'flat',
+ 'Semi Detached House': 'house',
+ 'Second Floor Flat': 'flat',
+ 'First Floor Flat': 'flat',
+ 'Level not confirmed': 'flat',
+ 'Cottage': 'house',
+ 'Studio (1st Floor)': 'flat',
+ 'Studio (Ground floor)': 'flat',
+ 'Guest room in a complex': 'other',
+ 'PIMSS EMPTY': 'bedsit',  # NOTE(review): built_form.py maps this same key to 'unknown' - confirm 'bedsit' is intended
+ 'Room Only': 'other',
+ 'Detached Property': 'house',
+ 'End Terrace Housex': 'house',
+ 'Coach House': 'coach house',
+ 'Mid Terrace Bungalow': 'bungalow',
+ 'End Terrace Bungalow': 'bungalow',
+ 'Mid Terrace House': 'house',
+ 'Detached Bungalow': 'bungalow',
+ 'End Terrace House': 'house',
+ 'Mid Terrace Housekeeping ': 'house',  # the trailing space is part of the key
+ 'Maisonnette': 'maisonette',
+ 'Guest Room': 'unknown',
+ 'Office Buildings': 'unknown',
+ 'Semi Detached Bung': 'bungalow',
+ 'Bedspace': 'bedsit',
+ 'Houses/Bungalows': 'bungalow',
+ 'Bedsits': 'bedsit',
+ 'Unknown': 'unknown',
+ 'Sheltered Flats/besits': 'flat',
+ 'House/Bungalow ': 'bungalow',  # the trailing space is part of the key
+ 'Low/Med Rise Flats/Mais': 'flat',
+ 'Staff/Comm': 'other',
+ 'A Rooms': 'other',
+ 'Studio (3rd floor and above)': 'flat',
+ 'Adapted Property For Disabled': 'unknown',
+ 'Studio (2nd floor)': 'flat',
+ 'Third Floor Flat': 'flat',
+ '2 Ext. Wall Flat': 'flat',
+ 'Hostel': 'other',
+ 'House: MidTerrace': 'house',
+ 'House: EndTerrace': 'house',
+ 'Flat: Mid Terrace: Mid Floor': 'flat',
+ 'Bungalow: SemiDetached': 'bungalow',
+ 'Bungalow: EndTerrace': 'bungalow',
+ 'Flat: End Terrace: Top Floor': 'flat',
+ 'Maisonette: End Terrace: Ground Floor': 'maisonette',
+ 'Flat: End Terrace: Ground Floor': 'flat',
+ 'Flat: Mid Terrace: Top Floor': 'flat',
+ 'House: Detached': 'house',
+ 'Flat: End Terrace: Mid Floor': 'flat',
+ 'House: SemiDetached': 'house',
+ 'Flat: Semi Detached: Ground Floor': 'flat',
+ 'Flat: Semi Detached: Top Floor': 'flat',
+ 'Flat: Mid Terrace: Ground Floor': 'flat',
+ 'Bungalow: MidTerrace': 'bungalow',
+ 'Flat: Enclosed End Terrace: Top Floor': 'flat',
+ 'Flat: Semi Detached: Mid Floor': 'flat',
+ 'Maisonette: Mid Terrace: Top Floor': 'maisonette',
+ 'House: EnclosedEndTerrace': 'house',
+ 'Flat: Detached: Ground Floor': 'flat',
+ 'Flat: Detached: Mid Floor': 'flat',
+ 'Flat: Detached: Top Floor': 'flat',
+ 'Bungalow: Detached': 'bungalow',
+ 'Maisonette: End Terrace: Mid Floor': 'maisonette',
+ 'Maisonette: Detached: Top Floor': 'maisonette',
+ 'Flat: Enclosed Mid Terrace: Mid Floor': 'flat',
+ 'Flat: Enclosed Mid Terrace: Ground Floor': 'flat',
+ 'Flat: Enclosed End Terrace: Mid Floor': 'flat',
+ 'Flat: Enclosed End Terrace: Ground Floor': 'flat',
+ 'Flat: Enclosed Mid Terrace: Top Floor': 'flat',
+ '2013 onwards': 'unknown',
+
+ 'House 2 Storey': 'house',
+ 'Bung': 'bungalow',
+ 'House 3 Storey': 'house',
+ 'Shared Flat': 'flat',
+ 'd': 'unknown',  # presumably a stray / junk cell value - TODO confirm
+ 'Mais': 'maisonette',
+ 'e': 'unknown',  # presumably a stray / junk cell value - TODO confirm
+ 'Shared House': 'house',
+ 'House 4 Storey': 'house',
+ 'Shared Bungalow': 'bungalow',
+ 'Detch': 'house',
+ 'Shop': 'other',
+ 'Terr': 'house',
+ 'Terrace': 'house',
+ 'Description': 'unknown',
+ 'Hse': 'house',
+ 'Room': 'other',
+ 'Office': 'other',
+ 'Room In Shared Accommodation': 'other',
+ 'Apartment': 'flat',
+ 'm': 'unknown',  # presumably a stray / junk cell value - TODO confirm
+ 'Garage': 'other',
+ 'Parking Space': 'other',
+ 'Community Centre': 'other',
+ 'Communal Facility': 'other',
+ 'Semi': 'house'
+}
diff --git a/asset_list/mappings/roof.py b/asset_list/mappings/roof.py
new file mode 100644
index 00000000..a95f0529
--- /dev/null
+++ b/asset_list/mappings/roof.py
@@ -0,0 +1,27 @@
+import numpy as np
+
+STANDARD_ROOF_CONSTRUCTIONS = {
+    # Pitched roofs
+    "pitched access to loft", "pitched no access to loft", "pitched unknown access to loft",
+    "pitched insulated",
+    # NOTE(review): "piched" looks like a typo for "pitched"; the string is kept byte-for-byte because
+    # other code may compare against this exact value - confirm usages before renaming.
+    "piched unknown insulation",
+    # Other roofs and catch-alls
+    "another dwelling above", "flat unknown insulation",
+    "unknown insulated", "unknown",
+}
+
+ROOF_CONSTRUCTION_MAPPINGS = {  # exact-match raw roof text -> one of the STANDARD_ROOF_CONSTRUCTIONS labels
+    "Flat": "flat unknown insulation",
+    "Pitched (access to loft)": "pitched access to loft",
+    "Pitched (no access to loft)": "pitched no access to loft",
+    "Another dwelling above": "another dwelling above",
+    "Same dwelling above": "another dwelling above",
+    "As-built": "unknown",
+    "ND (inferred)": "unknown",
+    "2018 onwards": "unknown",
+    "Pitched (vaulted ceiling)": "pitched insulated",
+    np.nan: "unknown",  # matched by object identity only, because NaN != NaN
+    None: "unknown",
+}
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
new file mode 100644
index 00000000..c327338a
--- /dev/null
+++ b/asset_list/mappings/walls.py
@@ -0,0 +1,170 @@
+import numpy as np
+
+STANDARD_WALL_CONSTRUCTIONS = {
+    # Cavity
+    "cavity unknown insulation", "uninsulated cavity", "partial insulated cavity", "filled cavity",
+    # Solid brick
+    "solid brick unknown insulation", "uninsulated solid brick", "insulated solid brick",
+    # Timber frame
+    "timber frame unknown insulation", "uninsulated timber frame", "insulated timber frame",
+    # Other construction types
+    "granite or whinstone", "sandstone or limestone", "system built", "cob",
+    # New builds and catch-alls
+    "new build - average thermal transmittance", "other", "unknown",
+}
+
+WALL_CONSTRUCTION_MAPPINGS = {  # exact-match (case- and whitespace-sensitive) raw wall text -> STANDARD_WALL_CONSTRUCTIONS label
+    "New Build - Average Thermal Transmittance": "new build - average thermal transmittance",
+    'Average thermal transmittance 0.25 W/m?K': 'unknown',
+    'Cavity wall, as built, insulated (assumed)': 'filled cavity',
+    'Average thermal transmittance 0.31 W/m?K': 'unknown',
+    'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+    'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown',
+    'Average thermal transmittance 0.16 W/m?K': 'unknown',
+    'Average thermal transmittance 0.27 W/m²K': 'unknown',
+    'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.18 W/m?K': 'unknown',
+    'Granite or whin, with internal insulation': 'granite or whinstone',
+    "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone",
+    'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown',
+    'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown',
+    'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown',
+    'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown',
+    'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+    'Average thermal transmittance 0.33 W/m?K': 'unknown',
+    'Cavity wall,': "cavity unknown insulation",
+    'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+    'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown',
+    'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown',
+    'Average thermal transmittance 0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown',
+    'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown',
+    'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown',
+    'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown',
+    'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown',
+    'Cavity wall, with internal insulation': 'filled cavity',
+    'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown',
+    'new build - average thermal transmittance': 'new build - average thermal transmittance',
+    'average thermal transmittance 0.25 w/m?k': 'unknown',
+    'cavity wall, as built, insulated (assumed)': 'filled cavity',
+    'average thermal transmittance 0.31 w/m?k': 'unknown',
+    'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+    'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown',
+    'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown',
+    'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.18 w/m?k': 'unknown',
+    'granite or whin, with internal insulation': 'granite or whinstone',
+    'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown',
+    'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown',
+    'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown',
+    'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown',
+    'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+    'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation",
+    'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+    'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown',
+    'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown',
+    'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown',
+    'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown',
+    'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown',
+    'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown',
+    'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown',
+    'average thermal transmittance 0.28 w/m?k': 'unknown',
+    'Cavity wall, filled cavity': 'filled cavity',
+    'Cavity wall, filled cavity and external insulation': 'filled cavity',
+    'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone',
+    'Solid brick, as built, insulated (assumed)': 'insulated solid brick',
+    'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick',
+    'Solid brick, with external insulation': 'insulated solid brick',
+    'Solid brick, with internal insulation': 'insulated solid brick',
+    'System built, as built, insulated (assumed)': 'system built',
+    'System built, as built, no insulation (assumed)': 'system built',
+    'System built, with external insulation': 'system built',
+    'System built, with internal insulation': 'system built',
+    'Timber frame, as built, insulated (assumed)': 'insulated timber frame',  # bare 'timber frame' is not a standard label; mirrors the solid-brick entries above
+    'Timber frame, as built, no insulation (assumed)': 'uninsulated timber frame',
+    'Timber frame, as built, partial insulation (assumed)': 'timber frame unknown insulation',  # NOTE(review): no "partial" timber-frame label exists - confirm
+    'Timber frame, with additional insulation': 'insulated timber frame',
+    'CAVITY': 'cavity unknown insulation',
+    'COMB': 'unknown',
+    'NONE': 'unknown',
+    'NOTKNOWN': 'unknown',
+    'SOLID': 'solid brick unknown insulation',
+    np.nan: 'unknown',  # only the np.nan object itself hits this key: NaN != NaN, so dict lookup falls back to identity
+    'RENDER/TIMBER FRAME': 'timber frame unknown insulation',
+    'SYSTEM BUILT': 'system built',
+    'PCC PANELS': 'other',
+    'NOT APPLICABLE - FLAT': 'unknown',
+    'BRICK/TIMBER FRAME': 'timber frame unknown insulation',
+    'BRICK/BLOCK CAVITY': 'cavity unknown insulation',
+    'STONE SOLID': 'sandstone or limestone',
+    'EXT CLADDING SYSTEM': 'system built',
+    'BRICK/BLOCK SOLID': 'solid brick unknown insulation',
+
+    'Cavity Filled cavity (with internal/external)': 'filled cavity',
+    'ND (inferred) Filled cavity': 'filled cavity',
+    'Cavity Filled cavity': 'filled cavity',
+    'Cavity Unknown insulation': 'cavity unknown insulation',
+    'Timber frame As-built': 'timber frame unknown insulation',  # same label as 'TimberFrame: AsBuilt' below
+    'System build Unknown insulation': 'system built',
+    'Cavity As-built': 'uninsulated cavity',
+    'System build External': 'system built',
+    'ND (inferred) ND (inferred)': 'unknown',
+    'Solid brick External': 'insulated solid brick',
+    'Cavity External': 'filled cavity',
+    'System build As-built': 'system built',
+    'Solid brick Internal': 'insulated solid brick',
+    'Cavity Internal': 'filled cavity',
+    'System build Internal': 'system built',
+    'Solid brick As-built': 'solid brick unknown insulation',
+
+    'Cavity ': 'cavity unknown insulation',
+    'Solid brick ': 'solid brick unknown insulation',
+    'Timber frame Timber frame (good insulation)': 'insulated timber frame',
+    ' ': 'unknown',
+    'Cavity No data': 'cavity unknown insulation',
+    'Non trad ': 'other',
+    'Solid brick / Multiple Attributes ': 'solid brick unknown insulation',
+    'Cavity Believe CWI done by Dyson': 'filled cavity',
+    'Cavity CWI required': 'uninsulated cavity',
+    'Solid brick EWI installed': 'insulated solid brick',
+    'Cavity Cavity batts': 'filled cavity',
+    'Cavity CWI Completed by Dyson': 'filled cavity',
+    None: "unknown",
+    "Cavity": "cavity unknown insulation",
+    'SolidBrick: Unknown': 'solid brick unknown insulation',
+    'Cavity: Unknown': 'cavity unknown insulation',
+    'Cavity: AsBuilt (Post 1995)': 'filled cavity',
+    'Cavity: AsBuilt (1976-1982)': 'cavity unknown insulation',
+    'SystemBuilt: AsBuilt': 'system built',
+    'TimberFrame: AsBuilt': "timber frame unknown insulation",
+    'Cavity: AsBuilt (1983-1995)': 'cavity unknown insulation',
+    'Cavity: AsBuilt (1983-1995), Cavity: FilledCavity': 'filled cavity',
+    'SolidBrick: AsBuilt': 'solid brick unknown insulation',
+    'Cavity: FilledCavity': 'filled cavity',
+    'SolidBrick: Internal': 'insulated solid brick',
+    'Cavity: External': 'filled cavity',
+    'Sandstone: Internal': 'sandstone or limestone',
+    'Cavity: AsBuilt (Pre 1976)': 'cavity unknown insulation',
+    'System build': 'system built',
+    'Solid brick': 'solid brick unknown insulation',
+    'Stone': 'sandstone or limestone',
+    'Timber frame': 'timber frame unknown insulation',
+    '2017 onwards': 'new build - average thermal transmittance',
+    'ND (inferred)': 'unknown',
+    'Flat / maisonette': 'other',
+
+    'Other': 'other',
+    'Timber Frame': 'timber frame unknown insulation',
+    'Cavity Wall': 'cavity unknown insulation',
+    'Non-Traditional': 'system built',
+    'PRC': 'system built',
+    'Cross Wall': 'system built',
+    'Solid Wall': 'solid brick unknown insulation',
+    'Traditional': 'other'
+}
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
new file mode 100644
index 00000000..fd43ac64
--- /dev/null
+++ b/asset_list/requirements.txt
@@ -0,0 +1,12 @@
+postal
+pandas
+usaddress
+pydantic-settings==2.6.0
+epc-api-python==1.0.2
+fuzzywuzzy
+boto3
+openpyxl
+openai
+tiktoken
+msgpack
+beautifulsoup4
\ No newline at end of file
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
new file mode 100644
index 00000000..b6d9a391
--- /dev/null
+++ b/asset_list/tests/test_standardisation.py
@@ -0,0 +1,5 @@
+from asset_list.AssetList import AssetList
+
+
+def test_multi_unit_address_flagging():
+    """An address that spans a range of rooms is expected to be flagged as a multi-unit address."""
+    address = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
+    flagged = AssetList._identify_multi_address(address)
+    assert flagged
diff --git a/asset_list/utils.py b/asset_list/utils.py
new file mode 100644
index 00000000..ff9db3f8
--- /dev/null
+++ b/asset_list/utils.py
@@ -0,0 +1,183 @@
+import time
+import numpy as np
+import pandas as pd
+from backend.SearchEpc import SearchEpc
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from tqdm import tqdm
+from utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+def get_data(
+ df,
+ manual_uprn_map,
+ epc_auth_token,
+ uprn_column,
+ fulladdress_column,
+ address1_column,
+ postcode_column,
+ property_type_column,
+ built_form_column,
+ epc_api_only=False,
+ row_id_name="row_id",
+):
+ """
+ Iterate over the rows of df and try to find the newest EPC for each home, together with its EPC
+ recommendations and (unless epc_api_only is set) the data retrieved with RetrieveFindMyEpc.
+
+ :param df: DataFrame with one row per home
+ :param manual_uprn_map: Mapping of full address -> UPRN, which takes priority over the UPRN column
+ :param epc_auth_token: Auth token passed to SearchEpc
+ :param uprn_column: Name of the column holding the UPRN
+ :param fulladdress_column: Name of the column holding the full address
+ :param address1_column: Name of the column holding the first address line / house number
+ :param postcode_column: Name of the column holding the postcode
+ :param property_type_column: Name of the column holding the standardised property type
+ :param built_form_column: Name of the column holding the standardised built form
+ :param epc_api_only: If True, the FindMyEPC retrieval is skipped
+ :param row_id_name: Name of the column that identifies a row; it is also used as a key in each output record
+ :return: Tuple of (epc_data, errors, no_epc). epc_data is a list of dicts, errors is the list of row ids
+ that raised an exception and no_epc is the list of row ids for which no EPC was found
+ """
+ # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
+ property_type_map = {
+ "house": "House",
+ "flat": "Flat",
+ "maisonette": "Maisonette",
+ "bungalow": "Bungalow",
+ "block house": "House",
+ "coach house": "House",
+ "bedsit": "Flat"
+ }
+
+ built_form_map = {
+ "mid-terrace": "Mid-Terrace",
+ "end-terrace": "End-Terrace",
+ "semi-detached": "Semi-Detached",
+ "detached": "Detached"
+ }
+
+ epc_data = []
+ errors = []
+ no_epc = []
+ for _, home in tqdm(df.iterrows(), total=len(df)):
+ try:
+
+ # If we have a block of flats, we cannot retrieve this data
+ if home.get(property_type_column) == "block of flats":
+ no_epc.append(home[row_id_name])
+ continue
+
+ postcode = home[postcode_column]
+ house_number = str(home[address1_column]).strip()
+ full_address = home[fulladdress_column].strip()
+ house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
+ if house_no is None:
+ house_no = house_number
+ # A manually mapped UPRN takes priority over the one in the asset list
+ uprn = manual_uprn_map.get(full_address, None)
+ if uprn is None and home.get(uprn_column):
+ uprn = home[uprn_column]
+
+ if pd.isnull(uprn):
+ uprn = None
+
+ property_type = property_type_map.get(home.get(property_type_column), None)
+ built_form = built_form_map.get(home.get(built_form_column))
+
+ searcher = SearchEpc(
+ address1=str(house_no),
+ postcode=postcode,
+ auth_token=epc_auth_token,
+ os_api_key="",
+ property_type=None,
+ fast=True,
+ full_address=full_address,
+ max_retries=5,
+ uprn=uprn
+ )
+ # Force the skipping of estimating the EPC
+ searcher.ordnance_survey_client.property_type = None
+ searcher.ordnance_survey_client.built_form = None
+
+ searcher.find_property(skip_os=True)
+
+ # Check if we have a flat or apartment
+ if searcher.newest_epc is None and uprn is None:
+ # Try again:
+ if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
+ # Backup
+ add1 = full_address.split(",")
+ if len(add1) > 1:
+ add1 = add1[1].strip()
+ else:
+ # Try splitting on space
+ add1 = full_address.split(" ")[0].strip()
+
+ else:
+ add1 = str(house_number)
+ searcher = SearchEpc(
+ address1=add1,
+ postcode=postcode,
+ auth_token=epc_auth_token,
+ os_api_key="",
+ property_type=None,
+ fast=True,
+ full_address=full_address,
+ max_retries=5
+ )
+
+ if (
+ "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
+ house_number.lower()
+ ):
+ searcher.ordnance_survey_client.property_type = "Flat"
+
+ searcher.find_property(skip_os=True)
+
+ # As a final resort, we estimate the EPC
+ if property_type is not None and searcher.newest_epc is None:
+ searcher.ordnance_survey_client.property_type = property_type
+ searcher.ordnance_survey_client.built_form = built_form
+ searcher.find_property(skip_os=True)
+
+ if searcher.newest_epc is None:
+ no_epc.append(home[row_id_name])
+ continue
+
+ # Look for EPC recommendations. This is best effort: if the lookup fails we carry on without them,
+ # but we do not swallow KeyboardInterrupt/SystemExit
+ try:
+ property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+ except Exception:
+ property_recommendations = {"rows": []}
+
+ if epc_api_only:
+ epc = {
+ row_id_name: home[row_id_name],
+ **searcher.newest_epc.copy(),
+ "recommendations": property_recommendations["rows"]
+ }
+
+ epc_data.append(epc)
+ continue
+
+ # Retrieve data from FindMyEPC
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except ValueError as e:
+ # If nothing was found for the full address, retry with the first address line only
+ if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except ValueError as e:
+ if "No EPC found" in str(e):
+ find_epc_data = {}
+ else:
+ logger.error(f"Error retrieving FindMyEPC data: {e}")
+ raise Exception(f"Error retrieving FindMyEPC data: {e}") from e
+ else:
+ find_epc_data = {}
+ except Exception as e:
+ raise Exception(f"Error retrieving FindMyEPC data: {e}") from e
+ # Wait a random amount of time between requests
+ time.sleep(np.random.uniform(0.1, 1))
+
+ epc = {
+ row_id_name: home[row_id_name],
+ **searcher.newest_epc.copy(),
+ "recommendations": property_recommendations["rows"],
+ "find_my_epc_data": find_epc_data,
+ }
+
+ epc_data.append(epc)
+ except Exception as e:
+ errors.append(home[row_id_name])
+ # Record why the row failed, otherwise the error is lost and only the row id is returned
+ logger.error(f"Error retrieving EPC data for row {home[row_id_name]}: {e}")
+ time.sleep(5)
+
+ return epc_data, errors, no_epc
diff --git a/backend/Funding.py b/backend/Funding.py
new file mode 100644
index 00000000..f5f85b9f
--- /dev/null
+++ b/backend/Funding.py
@@ -0,0 +1,413 @@
+import pandas as pd
+import numpy as np
+from typing import List
+
+from backend.app.plan.schemas import HousingType
+
+
+class Funding:
+    """
+    Given a property, this class identifies if the home is possibly eligible for funding under
+    the various funding schemes. It will also calculate the expected amount of funding available
+    and flag any tenant specific requirements that need to be considered for the funding to be attained
+    """
+
+    SCHEMES = ["eco4", "gbis", "whlg"]
+
+    # SAP half bands, ordered from the highest band to the lowest. sap_to_eco_band relies on this ordering
+    # to resolve a score that sits on a boundary shared by two bands (e.g. 86.0 or 96.0)
+    ECO_SAP_SCORE_THREHOLDS = [
+        {'Band': 'High_A', 'From': 96.0, 'Up to': 100.0, 'Mid-point': 98.0},
+        {'Band': 'Low_A', 'From': 92.0, 'Up to': 96.0, 'Mid-point': 94.0},
+        {'Band': 'High_B', 'From': 86.0, 'Up to': 91.0, 'Mid-point': 88.5},
+        {'Band': 'Low_B', 'From': 81.0, 'Up to': 86.0, 'Mid-point': 83.5},
+        {'Band': 'High_C', 'From': 74.5, 'Up to': 80.0, 'Mid-point': 77.25},
+        {'Band': 'Low_C', 'From': 69.0, 'Up to': 74.5, 'Mid-point': 71.75},
+        {'Band': 'High_D', 'From': 61.5, 'Up to': 68.0, 'Mid-point': 64.75},
+        {'Band': 'Low_D', 'From': 55.0, 'Up to': 61.5, 'Mid-point': 58.25},
+        {'Band': 'High_E', 'From': 46.5, 'Up to': 54.0, 'Mid-point': 50.25},
+        {'Band': 'Low_E', 'From': 39.0, 'Up to': 46.5, 'Mid-point': 42.75},
+        {'Band': 'High_F', 'From': 29.5, 'Up to': 38.0, 'Mid-point': 33.75},
+        {'Band': 'Low_F', 'From': 21.0, 'Up to': 29.5, 'Mid-point': 25.25},
+        {'Band': 'High_G', 'From': 10.5, 'Up to': 20.0, 'Mid-point': 15.25},
+        {'Band': 'Low_G', 'From': 1.0, 'Up to': 10.5, 'Mid-point': 5.75}
+    ]
+
+    # EPC ratings that the GBIS routes in this class accept
+    _GBIS_EPC_BANDS = ("D", "E", "F", "G")
+
+    def __init__(
+            self,
+            tenure: HousingType,
+            starting_epc,
+            starting_sap,
+            postcode,
+            floor_area,
+            council_tax_band,
+            property_recommendations,
+            project_scores_matrix,
+            whlg_eligible_postcodes,
+            gbis_abs_rate: int,
+            eco4_abs_rate: int,
+    ):
+        """
+        TODO: Use Pydantic to validate the parameter types
+        :param tenure: Indicates if the property is a social or private home
+        :param starting_epc: The current EPC rating (letter) of the property
+        :param starting_sap: The current SAP score (number) for the property
+        :param postcode: The postcode of the property
+        :param floor_area: The total floor area of the property
+        :param council_tax_band: The council tax band of the property
+        :param property_recommendations: The recommendations for the property
+        :param project_scores_matrix: The matrix of project scores for ECO4
+        :param whlg_eligible_postcodes: The postcodes eligible for WHLG
+        :param gbis_abs_rate: The assumed £/abs achieved by the installer for GBIS
+        :param eco4_abs_rate: The assumed £/abs achieved by the installer for ECO4
+        """
+
+        # TODO: Things we need to include:
+        #   1) Amount of funding
+        #   2) Fundable measures, as a subset of measures may be fundable, not all
+
+        self.tenure = tenure
+        self.starting_epc = starting_epc
+        self.starting_sap = starting_sap
+        self.postcode = postcode
+        self.starting_eco_band = self.sap_to_eco_band(self.starting_sap)
+        self.floor_area_segment = self.classify_floor_area(floor_area)
+        self.gbis_abs_rate = gbis_abs_rate
+        self.eco4_abs_rate = eco4_abs_rate
+        self.council_tax_band = council_tax_band
+
+        self.recommendations = property_recommendations
+
+        # The distinct measure types of the default recommendations
+        self.measure_types = list({r["measure_type"] for r in property_recommendations if r["default"]})
+
+        # Filter the eco4 project scores matrix on scores relevant to this property
+        self.project_scores_matrix = project_scores_matrix[
+            (project_scores_matrix["Floor Area Segment"] == self.floor_area_segment) &
+            (project_scores_matrix["Starting Band"] == self.starting_eco_band)
+        ]
+
+        # The postcode column is already lower case
+        self.whlg_eligible_postcodes = whlg_eligible_postcodes[
+            whlg_eligible_postcodes["Postcode"] == self.postcode.lower()
+        ]
+
+        # Store the final outputs. NOTE: the spelling of gbis_eligibiltiy is relied on by
+        # Property.insert_funding
+        self.gbis_eligibiltiy = {}
+        self.eco4_eligibility = {}
+        self.whlg_eligibility = {}
+
+    def output(
+            self,
+            scheme: str,
+            eligible: bool,
+            types: List[str],
+            measure_types: List[str],
+            project_score: float,
+            estimated_funding: float,
+            notify_tenant_benefits_requirements: bool,
+            notify_council_tax_band_requirements: bool,
+            notify_tenant_low_income_requirements: bool,
+            innovation_required: bool,
+    ):
+        """
+        Builds the standard eligibility record for a scheme
+        :param scheme: One of SCHEMES
+        :param eligible: Whether the property is eligible
+        :param types: The recommendation types covered by this record
+        :param measure_types: The measure types covered by this record
+        :param project_score: The project score of the measure(s)
+        :param estimated_funding: The estimated amount of funding
+        :param notify_tenant_benefits_requirements: True if the tenant must receive benefits
+        :param notify_council_tax_band_requirements: True if the council tax band needs to be checked
+        :param notify_tenant_low_income_requirements: True if the tenant must be low income
+        :param innovation_required: True if an innovation measure is required
+        :return: Dictionary describing the eligibility
+        :raises ValueError: If the scheme is not recognised
+        """
+
+        if scheme not in self.SCHEMES:
+            raise ValueError("Scheme not recognised")
+
+        return {
+            "scheme": scheme,
+            "eligible": eligible,
+            "type": types,
+            "measure_types": measure_types,
+            "project_score": project_score,
+            "estimated_funding": estimated_funding,
+            "requires_benefits": notify_tenant_benefits_requirements,
+            "requires_council_tax_band": notify_council_tax_band_requirements,
+            "requires_low_income": notify_tenant_low_income_requirements,
+            "innovation_required": innovation_required,
+        }
+
+    @staticmethod
+    def classify_floor_area(floor_area):
+        """
+        Classifies a floor area into the floor area segment used by the project scores matrix
+        :param floor_area: The total floor area of the property
+        :return: One of "0-72", "73-97", "98-199" or "200"
+        """
+        if floor_area <= 72:
+            return "0-72"
+
+        if floor_area <= 97:
+            return "73-97"
+
+        if floor_area <= 199:
+            return "98-199"
+
+        return "200"
+
+    def find_gbis_measures(self, measures):
+        """
+        The best measure is one that:
+        1) Creates some SAP movement, therefore enables eligibility
+        2) Generates the most funding
+        3) Has a reasonable ROI
+        :param measures: The recommendation types/measure types that are fundable
+        :return: List of dictionaries with the keys type, measure_type, project_score and estimated_funding,
+            sorted by the cost to the client per SAP point. The list is empty when no default
+            recommendation matches, or when none of them moves the property into a different half band
+        """
+        # Only default recommendations are considered. The brackets matter: without them "and" binds
+        # tighter than "or", and non-default recommendations would be let through when their type matches
+        candidates = [
+            m for m in self.recommendations
+            if ((m["type"] in measures) or (m["measure_type"] in measures)) and m["default"]
+        ]
+        if not candidates:
+            # An empty frame has no "sap_points" column, so there is nothing to score
+            return []
+
+        measure_table = pd.DataFrame(candidates)
+
+        measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap
+        # We classify the movement
+        measure_table["Finishing Band"] = np.floor(measure_table["post_install_sap"]).apply(self.sap_to_eco_band)
+        # Remove any measures that leave the property in its starting half band
+        measure_table = measure_table[measure_table["Finishing Band"] != self.starting_eco_band]
+
+        if measure_table.empty:
+            # No measure moves the property into a different band, so nothing is fundable
+            return []
+
+        # We merge on the project matrix, on post install band
+        measure_table = measure_table.merge(
+            self.project_scores_matrix, how="left", on="Finishing Band"
+        )
+        # Cost Savings is the abs
+        measure_table["estimated_funding"] = measure_table["Cost Savings"] * self.gbis_abs_rate
+        # We cap any estimated funding at the install cost
+        measure_table["estimated_funding"] = np.where(
+            measure_table["estimated_funding"] >= measure_table["total"],
+            measure_table["total"],
+            measure_table["estimated_funding"]
+        )
+
+        # Sort by the measure that will cost the client the least, per sap point
+        measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"]
+        measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"]
+        measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False])
+
+        return measure_table[
+            ["type", "measure_type", "Cost Savings", "estimated_funding"]
+        ].rename(columns={"Cost Savings": "project_score"}).to_dict("records")
+
+    def sap_to_eco_band(self, sap_points):
+        """
+        Given a sap point score, this function will classify the points into the SAP half-band
+        :param sap_points: The SAP score to classify
+        :return: The name of the half band, e.g. "High_D"
+        :raises ValueError: If the score does not fall within any half band
+        """
+
+        if sap_points > 100:
+            return "High_A"
+
+        # The thresholds are ordered from the highest band to the lowest and neighbouring bands can share
+        # a boundary (e.g. 86.0 is both the top of Low_B and the bottom of High_B). We take the first
+        # match, so a score sitting on a shared boundary is classified into the higher band
+        for band in self.ECO_SAP_SCORE_THREHOLDS:
+            if band["From"] <= sap_points <= band["Up to"]:
+                return band["Band"]
+
+        raise ValueError(f"SAP score {sap_points} does not fall within any SAP half band")
+
+    def _gbis_outputs(self, recommended_measures, *, benefits, council_tax_band, low_income, innovation):
+        """
+        Builds one GBIS eligibility record per recommended measure
+        :param recommended_measures: The output of find_gbis_measures
+        :param benefits: True if the tenant must receive benefits
+        :param council_tax_band: True if the council tax band needs to be checked
+        :param low_income: True if the tenant must be low income
+        :param innovation: True if an innovation measure is required
+        :return: List of eligibility records, as produced by output
+        """
+        return [
+            self.output(
+                scheme="gbis",
+                eligible=True,
+                types=[m["type"]],  # This is single measure so we only have one type
+                measure_types=[m["measure_type"]],
+                project_score=m["project_score"],
+                estimated_funding=m["estimated_funding"],
+                notify_tenant_benefits_requirements=benefits,
+                notify_council_tax_band_requirements=council_tax_band,
+                notify_tenant_low_income_requirements=low_income,
+                innovation_required=innovation,
+            ) for m in recommended_measures
+        ]
+
+    def gbis_prs(self):
+        """
+        Checks if a private rental is eligible for GBIS. There are the following possible options
+        1) General Eligibility, contingent on EPC D-G and council tax band A-D. Excludes CWI, LI and heating
+        controls
+        2) Low income group - contingent on EPC D-G and tenant must receive benefits. Excludes heating controls
+        3) GBIS Flex route 1, 3 - Great British Insulation Scheme Routes 1 and 3 are for pre-installation
+        SAP bands D-G for owner-occupied households, D-E for private rented sector households
+        (Including F & G if exempt from MEES). If household is low income. Excludes heating controls
+        4) GBIS Flex route 2 - EPC E - G and low income household. Excludes heating controls
+
+        Eligible measures:
+        • Solid wall
+        • pitched roof
+        • flat roof
+        • under floor
+        • solid floor park home and
+        • room in-roof insulation
+
+        :return: List of eligibility records, which is empty if there is no funding availability
+        """
+
+        general_measures = [
+            "internal_wall_insulation",
+            "external_wall_insulation",
+            "flat_roof_insulation",
+            "suspended_floor_insulation",
+            "room_roof_insulation",
+        ]
+        # Not available for every eligibility type
+        restricted_measures = ["cavity_wall_insulation", "loft_insulation"]
+        valid_measures = general_measures + restricted_measures
+
+        # Every route below requires EPC D-G
+        if self.starting_epc not in self._GBIS_EPC_BANDS:
+            return []
+
+        # General Eligibility
+        if (
+                any(measure in general_measures for measure in self.measure_types) and
+                (self.council_tax_band in [None, "A", "B", "C", "D"])
+        ):
+            # This function pulls out the various measures that can provide funding under GBIS
+            recommended_measures = self.find_gbis_measures(measures=general_measures)
+            # If the council tax band is missing, we notify the customer that this is a requirement that
+            # should be checked
+            general_outputs = self._gbis_outputs(
+                recommended_measures,
+                benefits=False,
+                council_tax_band=self.council_tax_band is None,
+                low_income=False,
+                innovation=False,
+            )
+            if general_outputs:
+                return general_outputs
+            # Otherwise none of the general measures is fundable, so we fall through to the low
+            # income/flex route, where CWI/LI are also available
+
+        # Low income/flex. NOTE: this compares the EPC rating (already checked above), it previously
+        # compared the numeric SAP score against the rating letters and so could never be true
+        if any(measure in valid_measures for measure in self.measure_types):
+            # Find the best measure, and can also include CWI/LI but requires the tenant to be
+            # low income or on benefits
+            recommended_measures = self.find_gbis_measures(measures=valid_measures)
+            return self._gbis_outputs(
+                recommended_measures,
+                benefits=True,
+                council_tax_band=False,
+                low_income=True,
+                innovation=False,
+            )
+
+        # Otherwise, no funding availability
+        return []
+
+    def gbis_social(self):
+        """
+        Because this is social housing, we have two typical means for eligibility
+        1) EPC D, where an innovation measure is required
+        2) EPC G-E, where an innovation measure isn't required
+        :return: List of eligibility records, which is empty if there is no funding availability
+        """
+        valid_measures = [
+            "internal_wall_insulation",
+            "external_wall_insulation",
+            "flat_roof_insulation",
+            "suspended_floor_insulation",
+            "room_roof_insulation",
+            "cavity_wall_insulation",
+            "loft_insulation",
+            "heating_control"
+        ]
+
+        # The rating letter is held in starting_epc; starting_sap is the numeric score
+        if self.starting_epc not in self._GBIS_EPC_BANDS:
+            return []
+
+        # All measures are available
+        recommended_measures = self.find_gbis_measures(measures=valid_measures)
+
+        return self._gbis_outputs(
+            recommended_measures,
+            benefits=False,
+            council_tax_band=False,
+            low_income=False,
+            # Only EPC D requires an innovation measure
+            innovation=self.starting_epc == "D",
+        )
+
+    def gbis(self):
+        """
+        Check if a property is eligible for GBIS and store the result in gbis_eligibiltiy
+        :raises NotImplementedError: If the tenure is neither private nor social
+        """
+
+        if self.tenure == "Private":
+            self.gbis_eligibiltiy = self.gbis_prs()
+            return
+
+        if self.tenure == "Social":
+            self.gbis_eligibiltiy = self.gbis_social()
+            # Without this return we would fall through to the error below for every social home
+            return
+
+        raise NotImplementedError(f"GBIS eligibility is not implemented for tenure {self.tenure!r}")
+
+    def whlg(self):
+        """
+        Check if a property is eligible for WHLG and store the result in whlg_eligibility
+        :raises NotImplementedError: If the postcode of a non-social home is WHLG eligible
+        """
+        if self.tenure == "Social":
+            # We can't do anything for social housing
+            self.whlg_eligibility = []
+            return
+
+        if not self.whlg_eligible_postcodes.empty:
+            # TODO: build whlg_eligibility with self.output(...) once the funding rules are defined
+            raise NotImplementedError("Implement me")
+
+    def eco4(self):
+        """
+        Checks if a property is eligible for ECO4 and stores the result in eco4_eligibility.
+        Only private homes are considered
+        """
+        if self.tenure == "Private":
+            self.eco4_eligibility = self.eco4_prs()
+
+    def eco4_prs(self):
+        """
+        Checks if a private rental is eligible for ECO4
+        :raises NotImplementedError: Always, the ECO4 rules have not been implemented yet
+        """
+        raise NotImplementedError("ECO4 eligibility for private rentals is not implemented")
+
+    def check_eligibiltiy(self):
+        """
+        This function instigates the checking process
+        :return:
+        """
+
+        self.gbis()
+        # TODO: call self.eco4() once the ECO4 eligibility has been implemented
+        self.whlg()
diff --git a/backend/Property.py b/backend/Property.py
index 31f207ab..52e8c213 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -22,6 +22,7 @@ from recommendations.recommendation_utils import (
)
from backend.ml_models.AnnualBillSavings import AnnualBillSavings
from backend.app.utils import sap_to_epc
+from backend.Funding import Funding
import backend.app.assumptions as assumptions
ENVIRONMENT = os.environ.get("ENVIRONMENT", "dev")
@@ -69,6 +70,10 @@ class Property:
# Contains the solar panel optimisation results from the Google Solar API
solar_panel_configuration = None
+ # If true, indicates the floor area has actually been given to us by the owner, and we should use this figure
+ # instead of the one in the EPC, when we simulate
+ owner_floor_area = False
+
def __init__(
self,
id,
@@ -103,7 +108,7 @@ class Property:
self.already_installed = ast.literal_eval(already_installed['already_installed']) if already_installed else []
self.non_invasive_recommendations = (
- ast.literal_eval(non_invasive_recommendations['recommendations']) if
+ non_invasive_recommendations['recommendations'] if
non_invasive_recommendations else []
)
# This is a list of measures that have been recommended for the property
@@ -132,9 +137,14 @@ class Property:
self.energy_cost_estimates = {}
self.energy_consumption_estimates = {}
+ # When storing the energy, we also reserve keys for the appliance and total CO2 emissions
self.energy = {
"primary_energy_consumption": epc_record.get("energy_consumption_current"),
- "co2_emissions": epc_record.get("co2_emissions_current"),
+ "epc_co2_emissions": epc_record.get("co2_emissions_current"),
+ # These will be added in once we estimate the amount of emissions from appliances - using the carbon
+ # intensity of electricity
+ "appliances_co2_emissions": None,
+ "co2_emissions": None
}
self.ventilation = {
"ventilation": epc_record.get("mechanical_ventilation"),
@@ -202,6 +212,11 @@ class Property:
# TODO: We keep this but only temporarily until we add bathrooms, bedrooms, building id to the condition data
self.parse_kwargs(kwargs)
+ # Funding
+ self.gbis_eligibiltiy = None
+ self.eco4_eligibility = None
+ self.whlg_eligibility = None
+
@classmethod
def extract_kwargs(cls, kwargs):
"""
@@ -215,25 +230,24 @@ class Property:
# as we collect more data from the energy assessment
n_bathrooms = kwargs.get("n_bathrooms", None)
- if n_bathrooms not in [None, ""]:
- # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5
- n_bathrooms = int(round(float(n_bathrooms) + 1e-5))
+ # We add on a small value to ensure that the number of bathrooms is rounded up, in case the value is 0.5
+ n_bathrooms = int(round(float(n_bathrooms) + 1e-5)) if n_bathrooms not in [None, ""] else None
n_bedrooms = kwargs.get("n_bedrooms", None)
- if n_bedrooms not in [None, ""]:
- n_bedrooms = int(round(float(n_bedrooms) + 1e-5))
+ n_bedrooms = int(round(float(n_bedrooms) + 1e-5)) if n_bedrooms not in [None, ""] else None
number_of_floors = kwargs.get("number_of_floors", None)
- if number_of_floors not in [None, ""]:
- number_of_floors = int(round(float(number_of_floors) + 1e-5))
+ number_of_floors = int(round(float(number_of_floors) + 1e-5)) if number_of_floors not in [None, ""] else None
insulation_floor_area = kwargs.get("insulation_floor_area", None)
- if insulation_floor_area not in [None, ""]:
- insulation_floor_area = float(insulation_floor_area)
+ insulation_floor_area = float(insulation_floor_area) if insulation_floor_area not in [None, ""] else None
insulation_wall_area = kwargs.get("insulation_wall_area", None)
- if insulation_wall_area not in [None, ""]:
- insulation_wall_area = float(insulation_wall_area)
+ insulation_wall_area = float(insulation_wall_area) if insulation_wall_area not in [None, ""] else None
+
+ # We allow for the asset owner to provide us with total floor area, in the event of it being incorrect
+ floor_area = kwargs.get("floor_area", None)
+ floor_area = float(floor_area) if floor_area not in [None, ""] else None
return {
"n_bathrooms": n_bathrooms,
@@ -242,12 +256,15 @@ class Property:
"insulation_floor_area": insulation_floor_area,
"insulation_wall_area": insulation_wall_area,
"building_id": kwargs.get("building_id", None),
+ "floor_area": floor_area
}
def parse_kwargs(self, kwargs):
# We extract the elements from kwargs that we recognise. Anything additional is ignored
for arg, val in kwargs.items():
if val is not None:
+ if arg == "floor_area":
+ self.owner_floor_area = True
setattr(self, arg, val)
def create_base_difference_epc_record(self, cleaned_lookup: dict):
@@ -257,14 +274,7 @@ class Property:
It will be the same starting and ending EPC, as we don't have the expected EPC yet
"""
- # difference_record = self.epc_record - self.epc_record
-
- # TODO: change these lower and replace in the settings file
- # print(
- # "CHANGE THE LATEST FIELD TO REMOVE NUMBER HABITABLE ROOMS IF WE WANT TO USE STARTING/ENDING"
- # )
fixed_data_col_names = MANDATORY_FIXED_FEATURES + LATEST_FIELD
- # print("NEED TO CHANGE THE DASH TO LOWER CASE")
fixed_data_col_names = [
x.lower().replace("_", "-") for x in fixed_data_col_names
]
@@ -275,8 +285,6 @@ class Property:
if k in fixed_data_col_names
}
- # difference_record.append_fixed_data(fixed_data)
-
difference_record = self.epc_record.create_EPCDifferenceRecord(
self.epc_record, fixed_data
)
@@ -285,10 +293,11 @@ class Property:
datasets=[difference_record], cleaned_lookup=cleaned_lookup
)
- # TODO: adjust the base difference record with the previously calculated u values + features
- # estimated_perimeter is different to the perimeter in the epc record
-
- # self.base_difference_record.df
+ # If we have variables that have been given to us by the landlord that we know are correct, whereas the EPC
+ # may not be, we use them
+ if self.owner_floor_area is not None:
+ self.base_difference_record.df["total_floor_area_ending"] = self.floor_area
+ self.base_difference_record.df["estimated_perimeter_ending"] = self.perimeter
def simulate_all_representative_recommendations(
self, property_representative_recommendations,
@@ -374,7 +383,7 @@ class Property:
for rec in property_recommendations_by_phase:
# We simulate the impact of the recommendation at this current phase, and all of the prior phases
- if rec["type"] in ["mechanical_ventilation", "trickle_vents", "draught_proofing"]:
+ if rec["type"] in ["trickle_vents", "draught_proofing"]:
continue
scoring_dict = self.create_recommendation_scoring_data(
@@ -382,8 +391,8 @@ class Property:
recommendation_record=recommendation_record,
recommendations=previous_phase_representatives + [rec],
primary_recommendation_id=rec["recommendation_id"],
- non_invasive_recommendations=self.non_invasive_recommendations,
)
+
self.recommendations_scoring_data.append(scoring_dict)
simulation_epc = self.epc_record.prepared_epc.copy()
@@ -426,6 +435,18 @@ class Property:
if phase_epc_transformation[k] == v:
continue
+ if k == "hotwater-description":
+ if (
+ v == "From main system"
+ ) and (
+ phase_epc_transformation["mainheat-description"] == "Electric storage heaters"
+ ) and (
+ "Electric immersion" in phase_epc_transformation["hotwater-description"]
+ ):
+ # It means we've recommended HHR with electric immersion, and shouldn't overwrite
+ # the hot water description
+ continue
+
raise NotImplementedError(
"Already have this key in the phase_epc_transformation - implement me"
)
@@ -441,7 +462,7 @@ class Property:
if self.simulation_epcs is None:
raise ValueError("Simulation EPCs have not been created")
- rec_ids = sorted(list(self.simulation_epcs.keys()))
+ rec_ids = list(self.simulation_epcs.keys())
updated_simulation_epcs = []
for rec_id in rec_ids:
sim_epc = self.simulation_epcs[rec_id].copy()
@@ -467,15 +488,12 @@ class Property:
# Now we havet this data inthe
self.updated_simulation_epcs = updated_simulation_epcs
- return updated_simulation_epcs
-
@staticmethod
def create_recommendation_scoring_data(
property_id,
recommendation_record,
recommendations: list,
primary_recommendation_id: int,
- non_invasive_recommendations: list = None,
):
"""
This function will iterate through a list of recommendations and apply a simulation for each recommendation
@@ -484,7 +502,6 @@ class Property:
:param recommendation_record: The record of the property, which will be updated
:param recommendations: The list of recommendations to apply
:param primary_recommendation_id: The id of the primary recommendation, which is used to identify the record
- :param non_invasive_recommendations: The list of non-invasive recommendations
:return: The updated recommendation record
"""
@@ -513,7 +530,7 @@ class Property:
"internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation",
"cylinder_thermostat", "loft_insulation", "room_roof_insulation", "flat_roof_insulation",
"solid_floor_insulation", "suspended_floor_insulation", "mixed_glazing",
- "windows_glazing"
+ "windows_glazing", "mechanical_ventilation"
]:
# We update the data, as defined in the recommendaton
for prefix in ["walls", "roof", "floor"]:
@@ -539,7 +556,7 @@ class Property:
"solid_floor_insulation", "suspended_floor_insulation",
"windows_glazing", "solar_pv", "heating", "hot_water_tank_insulation",
"heating_control", "secondary_heating", "cylinder_thermostat", "mixed_glazing",
- "extension_cavity_wall_insulation",
+ "extension_cavity_wall_insulation", "mechanical_ventilation",
]:
raise NotImplementedError(
"Implement me, given type %s" % recommendation["type"]
@@ -707,6 +724,15 @@ class Property:
"unadjusted": unadjusted_kwh_estimates
}
+ # Update carbon with appliances
+ self.energy["appliances_co2_emissions"] = (
+ (unadjusted_kwh_estimates["appliances"] * assumptions.ELECTRICITY_CARBON_INTENSITY) / 1000
+ )
+ # Re-calculate total CO2 emissions
+ self.energy["co2_emissions"] = float(np.round(
+ self.energy["epc_co2_emissions"] + self.energy["appliances_co2_emissions"], 2
+ ))
+
def set_spatial(self, spatial: pd.DataFrame):
"""
Sets whether the property is in a conservation area given the output of the ConservationAreaClient
@@ -1226,6 +1252,15 @@ class Property:
if (self.building_id is not None) and (self.solar_panel_configuration is not None):
return True
+ # If the property is listed or is a heritage building, solar panels are generally difficult to get
+ # through planning restrictions, so we do not recommend them. Conservation areas are deliberately
+ # not excluded here (see the note below)
+ if self.is_listed or self.is_heritage:
+ # If the property is in a conservation area, we can still recommend solar panels
+ # but they need to be done in a way that is sympathetic to the building. E.g. the panels
+ # may be installed such that they are not visible from the street
+ return False
+
is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"]
is_valid_roof_type = (
self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"]
@@ -1294,3 +1329,11 @@ class Property:
)
return electric_consumption
+
+ def insert_funding(self, funding_calulator: Funding):
+     """
+     Copy the scheme eligibility results from the funding calculator onto this property object
+     """
+     # NOTE: "gbis_eligibiltiy" is spelt this way on both objects
+     for scheme_attr in ("gbis_eligibiltiy", "eco4_eligibility", "whlg_eligibility"):
+         setattr(self, scheme_attr, getattr(funding_calulator, scheme_attr))
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 367d8c85..96b7c5de 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -2,6 +2,7 @@ import os
import time
import re
+from urllib.parse import urlencode
import usaddress
import pandas as pd
import numpy as np
@@ -95,7 +96,7 @@ vartypes = {
'walls-env-eff': 'str',
'transaction-type': 'str',
# 'uprn': "Int64",
- 'current-energy-efficiency': 'float',
+ 'current-energy-efficiency': 'Int64',
'energy-consumption-current': 'float',
'mainheat-description': 'str',
'lighting-cost-current': 'float',
@@ -138,8 +139,8 @@ class SearchEpc:
}
NODATA = {
- "status": 201,
- "message": "No data",
+ "status": 204,
+ "message": "no data",
"error": None
}
@@ -154,7 +155,7 @@ class SearchEpc:
uprn: [int, None] = None,
size=None,
property_type=None,
- fast=False
+ fast=False,
):
"""
Address lines 1 and postcode are mandatory fields. The other address lines are optional
@@ -206,10 +207,15 @@ class SearchEpc:
try:
# Updated regex to catch house numbers including alphanumeric ones
- pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
- match = re.search(pattern, address)
- if match:
- return next(g for g in match.groups() if g is not None)
+ pattern = r'(?i)(?:flat|apartment|room)\s*(\d+\w*)|^\s*(\d+\w*)'
+ match1 = re.search(pattern, address)
+ if match1:
+ return next(g for g in match1.groups() if g is not None)
+
+ pattern2 = r'(?i)(flat|apartment|room)\s*([a-zA-Z]?\d+[a-zA-Z]?)'
+ match2 = re.search(pattern2, address)
+ if match2:
+ return match2.group(2)
parsed = usaddress.parse(address)
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
@@ -220,7 +226,8 @@ class SearchEpc:
continue
if part == postcode.split(" ")[1]:
continue
- return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
+ return part.rstrip(",")
+ # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
# number
# Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found
@@ -247,46 +254,36 @@ class SearchEpc:
else:
return None
- def get_epc(self, params=None, size=None):
- # Get the EPC data with retries
- size = size if size is not None else self.size
- if params is None:
- if self.uprn:
- params = {"uprn": self.uprn}
- else:
- params = {"address": self.address1, "postcode": self.postcode}
+ def _get_epc(self, params, size):
+ """
+ To be called by get_epc() - not for external usage
+ """
+
+ url = os.path.join(self.client.domestic.host, "search")
+ if size:
+ url += "?" + urlencode({k: v for k, v in {"size": size}.items() if v})
for retry in range(self.max_retries):
try:
- if "uprn" in params:
- # We use the direct call method inside, since we need to implement uprn as a valid
- # parameter for the search function
- url = os.path.join(self.client.domestic.host, "search")
- response = self.client.domestic.call(method="get", url=url, params=params)
- else:
- response = self.client.domestic.search(params=params, size=size)
+ response = self.client.domestic.call(method="get", url=url, params=params)
if response:
self.data = response
- return self.SUCCESS
+ return {
+ "response": response,
+ "msg": self.SUCCESS
+ }
if retry > 0:
logger.info("Failed previous attempt but retry successful")
# If we got nothing, final try
if not response:
return {
- "status": 204,
- "message": "no data",
- "error": None
+ "response": response,
+ "msg": self.NODATA
}
- return {
- "status": 200,
- "message": "success",
- "error": None
- }
-
except Exception as e:
if retry < self.max_retries - 1:
# If not the last retry, wait for 3 seconds before retrying
@@ -294,11 +291,66 @@ class SearchEpc:
else:
# If it's the last retry, we continue
return {
- "status": 500,
- "message": "Could not retrieve EPC data",
- "error": str(e)
+ "response": {},
+ "msg": {
+ "status": 500,
+ "message": "Could not retrieve EPC data",
+ "error": str(e)
+ }
}
+ def get_epc(self, params=None, size=None):
+ # Get the EPC data with retries
+ size = size if size is not None else self.size
+ if params:
+ output = self._get_epc(params=params, size=size)
+ if output["msg"]["status"] == 200:
+ self.data = output["response"]
+ return output["msg"]
+
+ if not self.uprn and not self.address1 and not self.postcode:
+ raise ValueError("No search parameters provided")
+
+ uprn_params = {"uprn": self.uprn} if self.uprn else {}
+ address_params = {}
+ if self.address1:
+ address_params["address"] = self.address1
+ if self.postcode:
+ address_params["postcode"] = self.postcode
+
+ # We attempt the search with uprn params
+
+ data = {"rows": []}
+ api_response = {}
+ if uprn_params:
+ api_response = self._get_epc(params=uprn_params, size=size)
+ if api_response["msg"]["status"] == 200:
+ data["rows"].extend(api_response["response"]["rows"])
+
+ # We then also search by address, whether or not the UPRN search succeeded. We find that
+ # properties are sometimes listed under the wrong UPRN
+ if address_params:
+ api_response = self._get_epc(params=address_params, size=size)
+ if api_response["msg"]["status"] == 200:
+ # We update the data with the correct uprn
+ if self.uprn:
+ for x in api_response["response"]["rows"]:
+ x["uprn"] = self.uprn
+
+ data["rows"].extend(api_response["response"]["rows"])
+
+ # We now de-dupe on lmk-key to avoid duplicates
+ seen = set()
+ data["rows"] = [
+ row for row in data["rows"]
+ if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
+ ]
+
+ if data["rows"]:
+ api_response["msg"] = self.SUCCESS
+
+ return api_response["msg"]
+
def filter_rows(self, rows, property_type=None, address=None):
"""
This method should not be used when property_type and address are both not None
@@ -343,8 +395,12 @@ class SearchEpc:
rows_filtered = [r for r in rows if ", ".join([r["address"], r["posttown"]]) == best_match[0]]
else:
best_match = process.extractOne(address, [r["address"] for r in rows], score_cutoff=0)
+ # Get the UPRN for the best match
+ best_match_uprn = {r["uprn"] for r in rows if r["address"] == best_match[0]}.pop()
# Get all of the scores
- rows_filtered = [r for r in rows if r["address"] == best_match[0]]
+ rows_filtered = [
+ r for r in rows if (r["address"] == best_match[0]) or (r["uprn"] == best_match_uprn)
+ ]
if rows_filtered:
return rows_filtered
@@ -643,6 +699,7 @@ class SearchEpc:
estimation_data = epc_data[[key, "weight", "lodgement-datetime"]].copy()
estimation_data = estimation_data[~pd.isnull(estimation_data[key])]
estimation_data = estimation_data[~estimation_data[key].isin(Definitions.DATA_ANOMALY_MATCHES)]
+
if vartype == "Int64":
# We have some edge cases where we get the error "invalid literal for int() with base 10: '1.0'"
# so this handles this
@@ -654,6 +711,13 @@ class SearchEpc:
estimated_epc[key] = None
continue
+ if key == "floor-height":
+ # We specifically handle this, to avoid extreme values
+ # We check if we have any rows less than 3.5m
+ if estimation_data[estimation_data["floor-height"].astype(float) <= 3.5].shape[0] > 0:
+ # Perform the filter
+ estimation_data = estimation_data[estimation_data["floor-height"].astype(float) <= 3.5]
+
if vartype == "Int64":
estimated_value = self._estimate_int(estimation_data, key)
elif vartype == "float":
@@ -676,7 +740,30 @@ class SearchEpc:
estimated_epc["current-energy-rating"] = sap_to_epc(estimated_epc["current-energy-efficiency"])
+ # Convert the cost current and potential variables - to string integers
+ for variable in ["heating-cost-current", "hot-water-cost-current", "lighting-cost-current",
+ "heating-cost-potential", "hot-water-cost-potential", "lighting-cost-potential"]:
+ estimated_epc[variable] = str(int(estimated_epc[variable]))
+
+ # This is a string
+ estimated_epc["low-energy-fixed-light-count"] = (
+ str(estimated_epc["low-energy-fixed-light-count"]) if estimated_epc["low-energy-fixed-light-count"] else ""
+ )
+ # This is an int
+ estimated_epc["photo-supply"] = (
+ int(np.round(estimated_epc["photo-supply"])) if estimated_epc["photo-supply"] else estimated_epc[
+ "photo-supply"]
+ )
+
+ estimated_epc["co2-emiss-curr-per-floor-area"] = (
+ estimated_epc["co2-emissions-current"] / estimated_epc["total-floor-area"]
+ )
+
estimated_epc["postcode"] = self.postcode
+ if not self.uprn:
+ # Update self.uprn too
+ self.uprn = hash(self.address1 + self.postcode)
+
estimated_epc["uprn"] = self.uprn
estimated_epc["address"] = self.full_address
# Indicate that this epc was estimated
diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py
index 75f28ceb..cda32faa 100644
--- a/backend/apis/GoogleSolarApi.py
+++ b/backend/apis/GoogleSolarApi.py
@@ -9,8 +9,7 @@ from tqdm import tqdm
from math import sin, cos, sqrt, atan2, radians
from utils.logger import setup_logger
-from recommendations.Costs import Costs, MCS_SOLAR_PV_COST_DATA
-from etl.bill_savings.EnergyConsumptionModel import EnergyConsumptionModel
+from recommendations.Costs import Costs
from backend.ml_models.AnnualBillSavings import AnnualBillSavings
from backend.Property import Property
from backend.app.db.functions.solar_functions import get_solar_data, store_batch_data
@@ -51,6 +50,16 @@ class GoogleSolarApi:
MIN_UNIT_PANELS = 4 # Minimum number of panels we allow for a domestic building
MIN_BUILDING_PANELS = 10 # Minimum number of panels we allow for a block of flats
+ # Max area of a roof space we allow panels for
+ PERCENTAGE_OF_ROOF_LIMIT = 0.8
+
+ # If the roof area that comes back from the solar API is more than 25% larger (or smaller) than the estimated
+ # roof area that we calculate based on the property dimensions, we will correct the roof area
+ ROOF_AREA_TOLERANCE = 1.25
+
+ # Error Messages
+ ENTITY_NOT_FOUND_ERROR = 'Requested entity was not found.'
+
def __init__(self, api_key, max_retries=5):
"""
Initialize the GoogleSolarApi class with the provided API key and maximum retries.
@@ -109,6 +118,13 @@ class GoogleSolarApi:
response.raise_for_status() # Raise an error for bad status codes
return response.json()
except requests.exceptions.RequestException as e:
+ if (
+ (e.response.status_code == 404) &
+ (e.response.json()["error"]["message"] == self.ENTITY_NOT_FOUND_ERROR)
+ ):
+ logger.warning("No building insights found for the given location.")
+ return {"error": self.ENTITY_NOT_FOUND_ERROR}
+
attempt += 1
print(f"Attempt {attempt} failed: {e}")
time.sleep(2 ** attempt) # Exponential backoff
@@ -152,6 +168,10 @@ class GoogleSolarApi:
# If we have no data in the db, or updated_at is more than 6 months
if self.insights_data is None or is_outdated:
self.insights_data = self.get_building_insights(longitude, latitude, required_quality)
+ if self.insights_data.get("error") == self.ENTITY_NOT_FOUND_ERROR:
+ # We use default performance since in this case, we couldn't retrieve data. We don't store
+ self.panel_performance = self.default_panel_performance(property_instance=property_instance)
+ return
self.need_to_store = True
# Extract key data from the insights response
@@ -159,12 +179,19 @@ class GoogleSolarApi:
# Automatically exclude north-facing segments
self.exclude_north_facing_segments(property_instance=property_instance)
# If a property is semi-detached, it's possible for us to include segments from an attached unit
- if (property_instance.data["built-form"] == "Semi-Detached") and (
- property_instance.data["extension-count"] == 0
- ):
- self.exclude_likely_duplicate_surfaces()
+ if property_instance is not None:
+ if (property_instance.data["built-form"] == "Semi-Detached") and (
+ property_instance.data["extension-count"] == 0
+ ):
+ self.exclude_likely_duplicate_surfaces()
+ # We constrain the roof area, based on the floor area to be more conservative
self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2']
+ if (
+ self.roof_area > property_instance.roof_area * self.ROOF_AREA_TOLERANCE
+ ) | (self.roof_area < (2 - self.ROOF_AREA_TOLERANCE) * property_instance.roof_area):
+ self.roof_area = property_instance.roof_area
+
self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2']
self.panel_wattage = self.insights_data["solarPotential"]["panelCapacityWatts"]
if self.panel_wattage != 400:
@@ -179,7 +206,9 @@ class GoogleSolarApi:
# We now start finding the solar panel configurations
self.optimise_solar_configuration(
- energy_consumption=energy_consumption, is_building=is_building, property_instance=property_instance
+ energy_consumption=energy_consumption,
+ is_building=is_building,
+ property_instance=property_instance
)
# Finally, if we have a double property, we half the data we stored area
@@ -259,8 +288,6 @@ class GoogleSolarApi:
# minimum is 4
min_panels = self.MIN_BUILDING_PANELS if is_building else self.MIN_UNIT_PANELS
- cost_instance = Costs(property_instance=property_instance) if property_instance is not None else None
-
# Remove any north facing roof segments
panel_performance = []
for config in self.insights_data["solarPotential"].get("solarPanelConfigs", []):
@@ -294,14 +321,12 @@ class GoogleSolarApi:
if roi_summary["n_panels"].sum() < min_panels:
continue
- if cost_instance is None:
- total_cost = MCS_SOLAR_PV_COST_DATA["average_cost_per_kwh"] * (wattage / 1000)
- else:
- total_cost = cost_instance.solar_pv(
- n_panels=roi_summary["n_panels"].sum(),
- has_battery=False,
- n_floors=property_instance.number_of_floors,
- )["total"]
+ total_cost = Costs.solar_pv(
+ n_panels=roi_summary["n_panels"].sum(),
+ has_battery=False,
+ # Assume the most amount of scaffolding
+ n_floors=3 if property_instance is None else property_instance.number_of_floors
+ )["total"]
weighted_ratio = np.average(
roi_summary["ratio"].values, weights=roi_summary["generated_dc_energy"].values
@@ -491,6 +516,11 @@ class GoogleSolarApi:
panel_performance = panel_performance.drop(columns=["n_panels_halved"])
panel_performance = panel_performance[panel_performance["n_panels"] >= min_panels]
+ # Finally, we prevent panelled roof area being above a limit
+ panel_performance = panel_performance[
+ panel_performance["panneled_roof_area"] <= self.roof_area * self.PERCENTAGE_OF_ROOF_LIMIT
+ ]
+
self.panel_performance = panel_performance
def exclude_north_facing_segments(self, property_instance):
@@ -792,15 +822,19 @@ class GoogleSolarApi:
property_instance = [p for p in input_properties if p.id == unit["property_id"]][0]
# At this level, we check if the property is suitable for solar and if now, skip
# Or if we have a solar non-invasive recommendation
+
+ non_invasive_rec = next(
+ (r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"), {}
+ ).get("array_wattage")
+
if (
(not property_instance.is_solar_pv_valid()) or
- [r for r in property_instance.non_invasive_recommendations if r["type"] == "solar_pv"]
+ non_invasive_rec is not None
):
continue
if unit["longitude"] is None or unit["latitude"] is None:
# At this point, we've checked that solar PV is valid, and so we provide some defaults
-
property_instance.set_solar_panel_configuration(
solar_panel_configuration={
"insights_data": None,
@@ -855,19 +889,19 @@ class GoogleSolarApi:
cost_instance = Costs(property_instance=property_instance)
- # We return a 2.4 and 4 kwp system
+ # We return a 1.6 and 3.2 kwp system
panel_performance = pd.DataFrame(
[
{
- 'n_panels': 10,
- 'yearly_dc_energy': 4000 * 0.99, # Assumed 99% efficient wattage -> dc
+ 'n_panels': 8,
+ 'yearly_dc_energy': 3200 * assumptions.MEDIAN_WATTAGE_TO_DC,
'total_cost': cost_instance.solar_pv(
- n_panels=10, has_battery=False, n_floors=property_instance.number_of_floors
+ n_panels=8, has_battery=False, n_floors=property_instance.number_of_floors
)["total"],
'weighted_ratio': None,
- 'panneled_roof_area': 10 * assumptions.RDSAP_AREA_PER_PANEL,
- 'array_wattage': 4000,
- 'initial_ac_kwh_per_year': 4000 * 0.95, # Assumed 95% efficient wattage -> ac
+ 'panneled_roof_area': 8 * assumptions.RDSAP_AREA_PER_PANEL,
+ 'array_wattage': 3200,
+ 'initial_ac_kwh_per_year': 3200 * assumptions.MEDIAN_WATTAGE_TO_AC,
'lifetime_ac_kwh': None,
'lifetime_dc_kwh': None,
'roi': None,
@@ -879,15 +913,15 @@ class GoogleSolarApi:
'rank': None
},
{
- 'n_panels': 6,
- 'yearly_dc_energy': 2400 * 0.99, # Assumed 99% efficient wattage -> dc
+ 'n_panels': 4,
+ 'yearly_dc_energy': 1600 * assumptions.MEDIAN_WATTAGE_TO_DC,
'total_cost': cost_instance.solar_pv(
n_panels=6, has_battery=False, n_floors=property_instance.number_of_floors
)["total"],
'weighted_ratio': None,
- 'panneled_roof_area': 6 * assumptions.RDSAP_AREA_PER_PANEL,
- 'array_wattage': 2400,
- 'initial_ac_kwh_per_year': 2400 * 0.95, # Assumed 95% efficient wattage -> ac
+ 'panneled_roof_area': 4 * assumptions.RDSAP_AREA_PER_PANEL,
+ 'array_wattage': 1600,
+ 'initial_ac_kwh_per_year': 1600 * assumptions.MEDIAN_WATTAGE_TO_AC,
'lifetime_ac_kwh': None,
'lifetime_dc_kwh': None,
'roi': None,
diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py
index 79f2a087..f1090ef3 100644
--- a/backend/app/assumptions.py
+++ b/backend/app/assumptions.py
@@ -1,7 +1,7 @@
-# Assumes that the average efficiency of an air source heat pump is 250%, taking the median of the 200-400% range,
-# which is often quoted as a sensible efficiency range for air source heat pumps.
+# We assume that the ASHP efficiency is 280%, which is the minimum that Cotswolds Energy Group achieves, as
+# they target this
PESSIMISTIC_ASHP_EFFICIENCY = 200
-AVERAGE_ASHP_EFFICIENCY = 250
+AVERAGE_ASHP_EFFICIENCY = 280
# Conservative estimate of the proportion of electricity that will be consumed, whereas the rest will
# be exported. These are averages based on Google research. E.g
@@ -11,9 +11,15 @@ SOLAR_CONSUMPTION_WITH_BATTERY_PROPORTION = 0.7
# Typically, each solar panel takes up around 3.4 m2 of roof space under RdSAP. This was been verified in Elmhurst
RDSAP_AREA_PER_PANEL = 3.4
+# This is a median based on a sample of properties
+MEDIAN_WATTAGE_TO_AC = 0.965
+MEDIAN_WATTAGE_TO_DC = 0.99
SOCIAL_TENURES = ["Rented (social)", "rental (social)"]
+# Carbon intensity of electricity, as of 16th Jan 2025
+ELECTRICITY_CARBON_INTENSITY = 0.232
+
DESCRIPTIONS_TO_FUEL_TYPES = {
"Air source heat pump, radiators, electric": {
"fuel": "Electricity", "cop": AVERAGE_ASHP_EFFICIENCY / 100
@@ -50,4 +56,12 @@ DESCRIPTIONS_TO_FUEL_TYPES = {
},
"Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85},
"Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1},
+ "Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85},
+ "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85},
}
+
+# These are the measure types where if there is a ventilation recommendation, we force the inclusion of it
+# if one of these has been recommended.
+measures_needing_ventilation = [
+ "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"
+]
diff --git a/backend/app/db/functions/recommendations_functions.py b/backend/app/db/functions/recommendations_functions.py
index d6e41c61..d26adf66 100644
--- a/backend/app/db/functions/recommendations_functions.py
+++ b/backend/app/db/functions/recommendations_functions.py
@@ -138,7 +138,7 @@ def upload_recommendations(session: Session, recommendations_to_upload, property
"recommendation_id": recommendation_id,
"material_id": part["id"],
"depth": int(part["depth"]) if part["depth"] else None,
- "quantity": part["quantity"],
+ "quantity": float(part["quantity"]),
"quantity_unit": part["quantity_unit"],
"estimated_cost": part["total"],
}
diff --git a/backend/app/db/models/materials.py b/backend/app/db/models/materials.py
index f0af3343..9f8abbf4 100644
--- a/backend/app/db/models/materials.py
+++ b/backend/app/db/models/materials.py
@@ -19,6 +19,7 @@ class MaterialType(enum.Enum):
flat_roof_insulation = "flat_roof_insulation"
room_roof_insulation = "room_roof_insulation"
windows_glazing = "windows_glazing"
+ cavity_wall_extraction = "cavity_wall_extraction"
iwi_wall_demolition = "iwi_wall_demolition"
iwi_vapour_barrier = "iwi_vapour_barrier"
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 119c2061..80a531bf 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -1,3 +1,4 @@
+import ast
import json
from datetime import datetime
@@ -27,9 +28,11 @@ from backend.app.dependencies import validate_token
from backend.app.plan.schemas import PlanTriggerRequest
from backend.app.plan.utils import get_cleaned
from backend.app.utils import epc_to_sap_lower_bound, sap_to_epc
+import backend.app.assumptions as assumptions
from backend.ml_models.api import ModelApi
from backend.Property import Property
+from backend.Funding import Funding
from backend.apis.GoogleSolarApi import GoogleSolarApi
from recommendations.optimiser.CostOptimiser import CostOptimiser
@@ -42,6 +45,7 @@ from backend.ml_models.Valuation import PropertyValuation
from etl.bill_savings.KwhData import KwhData
from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
logger = setup_logger()
@@ -120,7 +124,7 @@ def extract_portfolio_aggregation_data(
# We can now calculate multiple outputs based on default recommendations
carbon_savings = sum([r["co2_equivalent_savings"] for r in default_recommendations])
- pre_retrofit_co2 = p.data["co2-emissions-current"]
+ pre_retrofit_co2 = p.energy["co2_emissions"]
post_retrofit_co2 = pre_retrofit_co2 - carbon_savings
pre_retrofit_energy_bill = sum(p.current_energy_bill.values())
@@ -337,7 +341,10 @@ def extract_property_request_data(
# Because we have some non-invasive recommendations that match on address and postcode, but not UPRN
# we need to check existence of uprn
- has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True
+ has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False
+ if has_uprn:
+ has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None]
+
if has_uprn:
property_non_invasive_recommendations = next((
x for x in non_invasive_recommendations if
@@ -352,7 +359,6 @@ def extract_property_request_data(
), {})
if isinstance(property_non_invasive_recommendations.get("recommendations"), str):
- import ast
property_non_invasive_recommendations["recommendations"] = ast.literal_eval(
property_non_invasive_recommendations["recommendations"]
)
@@ -363,16 +369,49 @@ def extract_property_request_data(
else:
transformed.append(rec)
- property_non_invasive_recommendations["recommendations"] = str(transformed)
+ property_non_invasive_recommendations["recommendations"] = transformed
- property_valution = next((
- float(x["value"]) for x in valuation_data if
- (str(x["uprn"]) == str(uprn))
- ), None)
+ # Check if the valuation data has uprn
+ valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False
+ if valuation_has_uprn:
+ valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None]
+
+ if valuation_has_uprn:
+ property_valution = next((
+ float(x["valuation"]) for x in valuation_data if
+ (str(x["uprn"]) == str(uprn))
+ ), None)
+ else:
+ property_valution = next((
+ float(x["valuation"]) for x in valuation_data if
+ (x["address"] == config["address"]) and (x["postcode"] == config["postcode"])
+ ), None)
return patch, property_already_installed, property_non_invasive_recommendations, property_valution
+def get_funding_data():
+ """
+ This function retrieves the eco project scores matrix and the warm homes local grant funding data
+ :return:
+ """
+ project_scores_matrix = read_csv_from_s3(
+ bucket_name=get_settings().DATA_BUCKET,
+ filepath="funding/ECO4 Full Project Scores Matrix.csv",
+ )
+ project_scores_matrix = pd.DataFrame(project_scores_matrix)
+ project_scores_matrix.columns = ['Floor Area Segment', 'Starting Band', 'Finishing Band', 'Cost Savings']
+ project_scores_matrix["Cost Savings"] = project_scores_matrix["Cost Savings"].astype(float)
+
+ whlg_eligible_postcodes = read_csv_from_s3(
+ bucket_name=get_settings().DATA_BUCKET,
+ filepath="funding/whlg eligible postcodes.csv",
+ )
+ whlg_eligible_postcodes = pd.DataFrame(whlg_eligible_postcodes)
+
+ return project_scores_matrix, whlg_eligible_postcodes
+
+
router = APIRouter(
prefix="/plan",
tags=["plan"],
@@ -393,6 +432,14 @@ async def trigger_plan(body: PlanTriggerRequest):
session.begin()
logger.info("Getting the inputs")
plan_input = read_csv_from_s3(bucket_name=get_settings().PLAN_TRIGGER_BUCKET, filepath=body.trigger_file_path)
+ # Check for duplicate UPRNS
+ input_uprns = [x.get("uprn") for x in plan_input if "uprn" in x and x.get("uprn")]
+
+ if input_uprns:
+ # Check for dupes
+ if len(input_uprns) != len(set(input_uprns)):
+ raise ValueError("Duplicate UPRNs in the input data")
+
# If we have patches or overrides, we should read them in here
patches, already_installed, non_invasive_recommendations, valuation_data = get_request_property_data(body)
@@ -424,13 +471,22 @@ async def trigger_plan(body: PlanTriggerRequest):
# Create a record in db
property_id, is_new = create_property(
- session, body.portfolio_id, epc_searcher.address_clean, epc_searcher.postcode_clean,
- epc_searcher.uprn,
- energy_assessment
+ session=session,
+ portfolio_id=body.portfolio_id,
+ address=epc_searcher.address_clean,
+ postcode=epc_searcher.postcode_clean,
+ uprn=epc_searcher.uprn,
+ energy_assessment=energy_assessment
)
if not is_new and not body.multi_plan:
continue
+ if epc_searcher.newest_epc is None:
+ raise ValueError(
+ "No EPCs found for this property and did not estimate - likely need to provide a"
+ "property type and built form"
+ )
+
if is_new:
create_property_targets(
session,
@@ -459,6 +515,14 @@ async def trigger_plan(body: PlanTriggerRequest):
)
)
+ # if we have a remote assessment event type, we pull the additional data and include it
+ if body.event_type == "remote_assessment":
+ logger.info("Retrieving find my epc data")
+ property_non_invasive_recommendations = RetrieveFindMyEpc.get_from_epc(
+ epc_searcher.newest_epc
+ )
+ # TODO: We need to determine if we should make a patch, if the EPC is new
+
epc_records = patch_epc(patch, epc_records)
prepared_epc = EPCRecord(
@@ -489,7 +553,8 @@ async def trigger_plan(body: PlanTriggerRequest):
model_api = ModelApi(
portfolio_id=body.portfolio_id,
timestamp=created_at,
- prediction_buckets=get_prediction_buckets()
+ prediction_buckets=get_prediction_buckets(),
+ max_retries=1
)
await model_api.async_warm_up_lambdas(
model_prefies=model_api.KWH_MODEL_PREFIXES + model_api.MODEL_PREFIXES
@@ -501,6 +566,7 @@ async def trigger_plan(body: PlanTriggerRequest):
logger.info("Reading in materials and cleaned datasets")
materials = get_materials(session)
cleaned = get_cleaned()
+ eco_project_scores_matrix, whlg_eligible_postcodes = get_funding_data()
kwh_client = KwhData(bucket=get_settings().DATA_BUCKET, read_consumption_data=True)
@@ -584,8 +650,10 @@ async def trigger_plan(body: PlanTriggerRequest):
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
recommendations_scoring_data = recommendations_scoring_data.drop(
- columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
- "carbon_ending"]
+ columns=[
+ "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
+ "carbon_ending"
+ ]
)
all_predictions = await model_api.async_paginated_predictions(
@@ -604,6 +672,7 @@ async def trigger_plan(body: PlanTriggerRequest):
property_instance=property_instance,
all_predictions=all_predictions,
recommendations=recommendations,
+ representative_recommendations=representative_recommendations
)
)
@@ -625,8 +694,6 @@ async def trigger_plan(body: PlanTriggerRequest):
)
# We now insert kwh estimates and costs into the recommendations
- # TODO: We should join the methodology which maps the heating and hot water descriptions to the fuel types in
- # Recommendations, but also the Property class
logger.info("Calculating tenant savings - kwh and bills")
for property_id in tqdm([p.id for p in input_properties]):
property_recommendations = recommendations.get(property_id, [])
@@ -636,59 +703,130 @@ async def trigger_plan(body: PlanTriggerRequest):
Recommendations.calculate_recommendation_tenant_savings(
property_instance=property_instance,
kwh_simulation_predictions=kwh_simulation_predictions,
- property_recommendations=property_recommendations
+ property_recommendations=property_recommendations,
+ ashp_cop=body.ashp_cop
)
)
property_instance.current_energy_bill = property_current_energy_bill
# Insert the predictions into the recommendations and run the optimiser
- # TODO: If a recommendation has a negative impact on SAP, we should remove it - this seems to have become a
- # possibility with heating system
- # TODO: After optimising, if there are any cheap, quick win measures (e.g. insulate water tank with hot water
- # cylinder jacket), we should add these to the recommendations as default
-
for p in input_properties:
if not recommendations.get(p.id):
continue
- input_measures = prepare_input_measures(recommendations[p.id], body.goal)
+ # we need to double unlist because we have a list of lists
+ property_measure_types = {rec["type"] for recs in recommendations[p.id] for rec in recs}
- current_sap_points = int(p.data["current-energy-efficiency"])
- target_sap_points = epc_to_sap_lower_bound(body.goal_value)
- sap_gain = CostOptimiser.calculate_sap_gain_with_slack(target_sap_points - current_sap_points)
+ property_required_measures = [
+ m for m in recommendations[p.id] if m[0]["type"] in body.required_measures
+ ]
+ measures_to_optimise = [
+ m for m in recommendations[p.id] if m[0]["type"] not in body.required_measures
+ ]
- if not body.optimise:
- if body.goal != "Increasing EPC":
- raise NotImplementedError("Only EPC optimisation is currently supported")
+ # If we have a wall insulation measure, we MUST include mechanical ventilation
+ # Additionally, if we have required measures, they should also be included. Therefore
+ # we can discount the number of points required to get to the target SAP band (or increase)
+ # in the case of ventilation
+ needs_ventilation = any(x in property_measure_types for x in assumptions.measures_needing_ventilation)
+
+ input_measures = prepare_input_measures(measures_to_optimise, body.goal, needs_ventilation)
+
+ if not input_measures[0]:
+ # This means that we have no defaults
+ selected_recommendations = {}
solution = []
- for sub_list in input_measures:
- # Select the entry with the highest gain, and if tied, choose the one with the lowest cost
- best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost']))
- solution.append(best_measure)
else:
- if body.budget:
- optimiser = GainOptimiser(
- input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0
+ fixed_gain = 0
+ if property_required_measures:
+ # We get the SAP points for the required measures
+ if body.goal != "Increasing EPC":
+ raise NotImplementedError("Only EPC optimisation is currently supported")
+ sap_by_type = [
+ {"type": rec["type"], "sap_points": rec["sap_points"]} for recs in property_required_measures
+ for rec in recs
+ ]
+ # We get a MAX sap points per type
+ max_per_type = (
+ pd.DataFrame(sap_by_type).groupby("type")["sap_points"].max().to_dict()
)
+ fixed_gain = sum(max_per_type.values())
+
+ property_required_measure_types = {rec["type"] for rec in sap_by_type}
+
+ # if the property needs ventilation, but the measure we optimise didn't include
+ # ventilation, we add the points for ventilation as a fixed gain
+ if needs_ventilation and any(
+ r in property_required_measure_types for r in assumptions.measures_needing_ventilation
+ ):
+ fixed_gain += next(
+ (r[0]["sap_points"] for r in recommendations[p.id] if
+ r[0]["type"] == "mechanical_ventilation"),
+ 0
+ )
+
+ current_sap_points = int(p.data["current-energy-efficiency"])
+
+ sap_gain = CostOptimiser.calculate_sap_gain_with_slack(
+ epc_to_sap_lower_bound(body.goal_value) - current_sap_points
+ ) - fixed_gain
+
+ if not body.optimise:
+ if body.goal != "Increasing EPC":
+ raise NotImplementedError("Only EPC optimisation is currently supported")
+ solution = []
+ for sub_list in input_measures:
+ # Select the entry with the highest gain, and if tied, choose the one with the lowest cost
+ best_measure = max(sub_list, key=lambda x: (x['gain'], -x['cost']))
+ solution.append(best_measure)
else:
- # The minimum gain is the minimum number of SAP points required to get to the target SAP band
- # If the gain is negative, the optimiser will return an empty solution
- optimiser = CostOptimiser(
- input_measures,
- min_gain=sap_gain
- )
- optimiser.setup()
- optimiser.solve()
- solution = optimiser.solution
+ if body.budget:
+ optimiser = GainOptimiser(
+ input_measures, max_cost=body.budget, max_gain=sap_gain if sap_gain > 0 else 0
+ )
+ else:
+ # The minimum gain is the minimum number of SAP points required to get to the target SAP band
+ # If the gain is negative, the optimiser will return an empty solution
+ optimiser = CostOptimiser(
+ input_measures,
+ min_gain=sap_gain
+ )
- selected_recommendations = {r["id"] for r in solution}
+ optimiser.setup()
+ optimiser.solve()
+ solution = optimiser.solution
+
+ selected_recommendations = {r["id"] for r in solution}
+
+ if property_required_measures:
+ # We select the cheapest of the required measures, into selected
+ for recs in property_required_measures:
+ # We select the cheapest of the required measures
+ cost_to_id = {
+ rec["recommendation_id"]: rec["total"] for rec in recs
+ if rec["recommendation_id"] not in selected_recommendations
+ }
+ # Take the recommendation id with the lowest cost
+
+ selected_recommendations.add(min(cost_to_id, key=cost_to_id.get))
+ # Update the solution with the selected recommendations
+ solution = []
+ for recs in recommendations[p.id]:
+ for rec in recs:
+ if rec["recommendation_id"] in selected_recommendations:
+ solution.append(
+ {
+ "id": rec["recommendation_id"],
+ "cost": rec["total"],
+ "gain": rec["sap_points"],
+ "type": rec["type"]
+ }
+ )
# If wall insulation is selected, we also include mechanical ventilation as a best practice measure
- if any(x in [r["type"] for r in solution] for x in [
- "internal_wall_insulation", "external_wall_insulation", "cavity_wall_insulation"
- ]):
+ if any(x in [r["type"] for r in solution] for x in assumptions.measures_needing_ventilation):
ventilation_rec = next(
(r[0] for r in recommendations[p.id] if r[0]["type"] == "mechanical_ventilation"),
None
@@ -717,10 +855,57 @@ async def trigger_plan(body: PlanTriggerRequest):
]
# We'll also unlist the recommendations so they're a bit easier to handle from here onwards
- final_recommendations = [
+ recommendations[p.id] = [
rec for recommendations_by_type in final_recommendations for rec in recommendations_by_type
]
- recommendations[p.id] = final_recommendations
+
+ # when we have buildings, we tweak our solar PV recommendations as if one unit needs it, we apply it to all
+ # of them
+ # TODO: We can probably do better and optimise at the building level - this is temp
+ logger.info("Adjusting solar PV recommendations for buildings")
+ building_ids = set([p.building_id for p in input_properties if p.building_id is not None])
+
+ for bid in building_ids:
+ # We check if any of them have solar PV
+ building = [p for p in input_properties if p.building_id == bid]
+ has_solar = False
+ for unit in building:
+ # Get default recommendations
+ has_solar = len([r for r in recommendations[unit.id] if r["default"] and r["type"] == "solar_pv"]) > 0
+ if has_solar:
+ break
+
+ if has_solar:
+ # We adjust the units within the building
+ for unit in building:
+ for rec in recommendations[unit.id]:
+ if rec["type"] == "solar_pv":
+ # This is straightforward, we just set the default to True, since when we're at a building
+ # level, we only allow 1 solar PV option for each unit. If we change this, this logic will
+ # need to be updated
+ rec["default"] = True
+
+ # ~~~~~~~~~~~~~~~~
+ # Funding
+ # ~~~~~~~~~~~~~~~~
+
+ # for p in input_properties:
+ # funding_calulator = Funding(
+ # tenure=body.housing_type,
+ # starting_epc=p.data["current-energy-rating"],
+ # starting_sap=int(p.data["current-energy-efficiency"]),
+ # postcode=p.postcode,
+ # floor_area=p.floor_area,
+ # council_tax_band=None, # This is seemingly always None at the moment
+ # property_recommendations=recommendations[p.id],
+ # project_scores_matrix=eco_project_scores_matrix,
+ # whlg_eligible_postcodes=whlg_eligible_postcodes,
+ # gbis_abs_rate=15,
+ # eco4_abs_rate=15,
+ # )
+ # funding_calulator.check_eligibiltiy()
+ # # Insert finding
+ # p.insert_funding(funding_calulator)
logger.info("Uploading recommendations to the database")
# If we have any work to do, we create a new scenario
@@ -759,7 +944,11 @@ async def trigger_plan(body: PlanTriggerRequest):
new_epc = sap_to_epc(new_sap_points)
new_epc_bands[p.id] = new_epc
- valuations = PropertyValuation.estimate(property_instance=p, target_epc=new_epc)
+ total_cost = sum([r["total"] for r in default_recommendations])
+
+ valuations = PropertyValuation.estimate(
+ property_instance=p, target_epc=new_epc, total_cost=total_cost
+ )
property_value_increase_ranges[p.id] = valuations
if p.is_new:
@@ -844,6 +1033,7 @@ async def trigger_plan(body: PlanTriggerRequest):
# Commit final changes
session.commit()
+
except IntegrityError:
logger.error("Database integrity error occurred", exc_info=True)
session.rollback()
diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py
index f84912fe..5db3d4d1 100644
--- a/backend/app/plan/schemas.py
+++ b/backend/app/plan/schemas.py
@@ -37,6 +37,7 @@ MEASURE_MAP = {
VALID_GOALS = ["Increasing EPC"]
VALID_HOUSING_TYPES = ["Social", "Private"]
+VALID_EVENT_TYPES = ["remote_assessment"]
# Define the validation function for inclusions/exclusions
@@ -56,10 +57,16 @@ def check_housing_type(value: str) -> str:
return value
+def check_event_type(value: str) -> str:
+    """
+    Validate that the given value is one of the supported event types.
+    :param value: The event type to check
+    :return: The value, unchanged, when it is listed in VALID_EVENT_TYPES
+    """
+    is_valid_event_type = value in VALID_EVENT_TYPES
+    assert is_valid_event_type, f"{value} is not a valid event type"
+    return value
+
+
# Use Annotated with BeforeValidator for each list item validation
InclusionOrExclusionItem = Annotated[str, BeforeValidator(check_inclusion_or_exclusion)]
Goal = Annotated[str, BeforeValidator(check_goals)]
HousingType = Annotated[str, BeforeValidator(check_housing_type)]
+EventType = Annotated[str, BeforeValidator(check_event_type)]
class PlanTriggerRequest(BaseModel):
@@ -75,8 +82,17 @@ class PlanTriggerRequest(BaseModel):
valuation_file_path: Optional[str] = None
exclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1)
inclusions: Optional[List[InclusionOrExclusionItem]] = Field(default=None, min_length=1)
+ # This is a list of measures that we want to be included, if they are options
+ # Default to empty
+ required_measures: Optional[List[InclusionOrExclusionItem]] = Field(default=[], min_length=1)
scenario_name: Optional[str] = ""
multi_plan: Optional[bool] = False
optimise: Optional[bool] = True
default_u_values: Optional[bool] = True
+
+ ashp_cop: Optional[float] = 2.8
+
+ # When performing a remote assessment, if this has been set, it will allow the engine to
+ # pull data from the find my epc website, to utilise as part of a remote assessment.
+ # The value is validated against VALID_EVENT_TYPES through the EventType annotation
+ event_type: Optional[EventType] = "remote_assessment"
diff --git a/backend/app/plan/utils.py b/backend/app/plan/utils.py
index 07d4642d..34fb02e7 100644
--- a/backend/app/plan/utils.py
+++ b/backend/app/plan/utils.py
@@ -1,9 +1,5 @@
-import pandas as pd
-from backend.Property import Property
from utils.s3 import read_from_s3
-from recommendations.recommendation_utils import get_wall_u_value, get_floor_u_value, get_roof_u_value
-
from backend.app.config import get_settings
import msgpack
diff --git a/backend/ml_models/AnnualBillSavings.py b/backend/ml_models/AnnualBillSavings.py
index 211e5ea6..b22837d8 100644
--- a/backend/ml_models/AnnualBillSavings.py
+++ b/backend/ml_models/AnnualBillSavings.py
@@ -28,8 +28,8 @@ class AnnualBillSavings:
# Latest price cap figures from Ofgem are for April 2024
# https://www.ofgem.gov.uk/energy-price-cap
- ELECTRICITY_PRICE_CAP = 0.2236
- GAS_PRICE_CAP = 0.0548
+ ELECTRICITY_PRICE_CAP = 0.2486
+ GAS_PRICE_CAP = 0.0634
# This is the most recent export payment figure, at 9.28p/kWh
# Smart export guarantee rates can be found here:
# https://www.sunsave.energy/solar-panels-advice/exporting-to-the-grid/best-seg-rates
@@ -39,8 +39,8 @@ class AnnualBillSavings:
PRICE_FACTOR = 0.09549999999999999
# Daily standard charge, based on average across England, Scotland and Wales, and includes VAT
- DAILY_STANDARD_CHARGE_GAS = 0.3143
- DAILY_STANDARD_CHARGE_ELECTRICITY = 0.601
+ DAILY_STANDARD_CHARGE_GAS = 0.3165
+ DAILY_STANDARD_CHARGE_ELECTRICITY = 0.6097
# Based on https://www.nottenergy.com/advice-and-tools/project-energy-cost-comparison
# For July 2024. These quotes are based on the east midlands region, so we
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 92c55641..6d4852b2 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -1,5 +1,4 @@
import numpy as np
-from scipy.constants import value
class PropertyValuation:
@@ -203,12 +202,43 @@ class PropertyValuation:
return msm_increase, lloyds_increase
@classmethod
- def estimate(cls, property_instance, target_epc):
+ def estimate(cls, property_instance, target_epc, total_cost=None):
+ """
+ This function estimates the value of a property based on the current EPC rating and the target EPC rating
+ :param property_instance: An instance of the Property class
+ :param target_epc: The target EPC rating
+ :param total_cost: The total cost of the retrofit
+ :return:
+ """
current_value = (
property_instance.valuation if property_instance.valuation else
cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn)
)
+ current_epc = property_instance.data["current-energy-rating"]
+
+ if not current_value:
+ return {
+ "current_value": 0,
+ "lower_bound_increased_value": 0,
+ "upper_bound_increased_value": 0,
+ "average_increased_value": 0,
+ "average_increase": 0
+ }
+
+ return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
+
+ @classmethod
+ def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None):
+ """
+ This function estimates the value of a property based on the current EPC rating and the target EPC rating
+ :param current_value:
+ :param current_epc:
+ :param target_epc:
+ :param total_cost:
+ :return:
+ """
+
if not current_value:
return {
"current_value": 0,
@@ -218,7 +248,6 @@ class PropertyValuation:
"average_increase": 0
}
- current_epc = property_instance.data["current-energy-rating"]
# We get the spectrum of ratings between the current and target EPC
epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1]
@@ -242,6 +271,19 @@ class PropertyValuation:
avg_increase = np.mean(all_increases)
+ if total_cost is not None:
+     # We CAP the retrofit ROI at 2, i.e. the average value increase may be no more than twice the total cost.
+     # The check is written as a multiplication rather than a ratio so that a total cost of 0 (for example
+     # when there are no default recommendations) cannot cause a division by zero
+     avg_increase_value = current_value * avg_increase
+     double_cost = 2 * total_cost
+     if avg_increase_value > double_cost:
+         # We re-scale the % so that the average value increase is no more than 2 times the total cost
+         new_avg_increase = double_cost / current_value
+         scalar = new_avg_increase / avg_increase
+         # We scale the min and max increases by the same scalar
+         min_increase *= scalar
+         max_increase *= scalar
+         avg_increase = new_avg_increase
+
return {
"current_value": current_value,
"lower_bound_increased_value": float(current_value * (1 + min_increase)),
diff --git a/backend/ml_models/api.py b/backend/ml_models/api.py
index c2f2dcd9..c108f1b7 100644
--- a/backend/ml_models/api.py
+++ b/backend/ml_models/api.py
@@ -39,6 +39,7 @@ class ModelApi:
timestamp,
prediction_buckets,
base_url="https://api.dev.hestia.homes",
+ max_retries=2,
):
"""
This class handles the communication with the Model APIs. These models include SAP change, heat demain change
@@ -54,6 +55,8 @@ class ModelApi:
self.timestamp = timestamp
self.prediction_buckets = prediction_buckets
+ self.max_retries = max_retries
+
@staticmethod
def predictions_template():
return {
@@ -295,15 +298,33 @@ class ModelApi:
async def run_batches():
for chunk in tqdm(to_loop_over, total=len(to_loop_over)):
- predictions_dict = await self.predict_all_async(
- df=data.iloc[chunk:chunk + batch_size],
- bucket=bucket,
- model_prefixes=model_prefixes,
- extract_ids=extract_ids
- )
- for key, scored in predictions_dict.items():
- all_predictions[key] = pd.concat([all_predictions[key], scored])
+ # Each batch gets one initial attempt plus up to `self.max_retries` retries
+ max_attempts = self.max_retries + 1
+ attempts = 0
+ success = False
+ while attempts < max_attempts and not success:
+     attempts += 1
+     try:
+         predictions_dict = await self.predict_all_async(
+             df=data.iloc[chunk:chunk + batch_size],
+             bucket=bucket,
+             model_prefixes=model_prefixes,
+             extract_ids=extract_ids
+         )
+
+         for key, scored in predictions_dict.items():
+             all_predictions[key] = pd.concat([all_predictions[key], scored])
+
+         success = True
+     except Exception as e:
+         # Report the attempt number against the real number of attempts allowed, so that the log
+         # never reads e.g. "Attempt 3/2"
+         logger.error(
+             f"Batch {chunk}-{chunk + batch_size} failed (Attempt {attempts}/{max_attempts}). "
+             f"Error: {e}"
+         )
+
+ if not success:
+     # The batch is dropped so that the remaining batches can still be scored
+     logger.error(
+         f"Skipping batch {chunk}-{chunk + batch_size} after {attempts} failed attempts."
+     )
# Check if there is an existing event loop
try:
diff --git a/backend/requirements/requirements.txt b/backend/requirements/requirements.txt
index dd5c34ca..577776be 100644
--- a/backend/requirements/requirements.txt
+++ b/backend/requirements/requirements.txt
@@ -29,3 +29,5 @@ mip==1.15.0
pyarrow==17.0.0
fastparquet==2024.5.0
aiohttp==3.10.10
+# find my epc
+beautifulsoup4
diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py
new file mode 100644
index 00000000..562585ad
--- /dev/null
+++ b/backend/tests/test_search_epc.py
@@ -0,0 +1,59 @@
+import pytest
+import os
+from backend.SearchEpc import SearchEpc # Replace with your actual module name
+from dotenv import load_dotenv
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+class TestSearchEpcIntegration:
+    """
+    Tests for SearchEpc. `test_find_property` is an integration test that makes real API calls using the
+    EPC_AUTH_TOKEN loaded from the environment; `test_search_housenumber` only exercises address parsing.
+    """
+
+    # The argument names must match both the values in each tuple and the parameters of the test function
+    @pytest.mark.parametrize(
+        "address, postcode, uprn, skip_os, lmk_key, n_old_epcs",
+        [
+            # Test case 1: Valid address and postcode, skipping OS
+            # In this case, the property is an individual flat but the uprn associated to the
+            # EPC is for the building as a whole, possibly because there was a conversion of sorts
+            ("Garden Flat, 48 Bedminster Parade", "BS3 4HS", 308249, True,
+             "260907a5431fa073d193cc6bbec51fbf1ba9a61845ab2503f85aa19ce3ed6afd", 1),
+
+            # Test case 2: Another valid address and postcode
+            # In this case, the newest EPC, does not have a uprn associated to it. If we did a search by
+            # uprn, we would get an old EPC
+            ("Flat 8, Hainton House", "DN32 9AQ", 10090082018, True,
+             "bd1149a20a73397184f07a9955f872424826e70f4870c058d71be887766ee1f8", 3),
+
+        ],
+    )
+    def test_find_property(self, address, postcode, uprn, skip_os, lmk_key, n_old_epcs):
+        """
+        Integration test for `find_property`, making actual API calls.
+        :param address: First line of the address to search for
+        :param postcode: Postcode of the property
+        :param uprn: UPRN passed to the search and expected on the newest EPC
+        :param skip_os: Passed through to `find_property`
+        :param lmk_key: Expected lmk-key of the newest EPC
+        :param n_old_epcs: Expected number of older EPCs found
+        """
+        # Provide your actual API keys or tokens here
+        os_api_key = ""
+
+        # Initialize the SearchEpc instance
+        epc_searcher = SearchEpc(
+            address1=address,
+            postcode=postcode,
+            uprn=uprn,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key=os_api_key,
+        )
+
+        # Execute the method
+        epc_searcher.find_property(skip_os=skip_os)
+
+        # We check that we have the correct epc
+        assert epc_searcher.newest_epc["lmk-key"] == lmk_key
+        assert epc_searcher.newest_epc["uprn"] == uprn
+        assert len(epc_searcher.older_epcs) == n_old_epcs
+
+    def test_search_housenumber(self):
+        """Check that alphanumeric flat numbers are extracted from a full address string."""
+        eg1 = 'Flat A11, Mortimer House, Grendon Road, Exeter'
+        res1 = SearchEpc.get_house_number(eg1, None)
+        assert res1 == "A11"
+
+        eg2 = 'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL'
+        res2 = SearchEpc.get_house_number(eg2, None)
+        assert res2 == "A9"
diff --git a/etl/access_reporting/app.py b/etl/access_reporting/app.py
new file mode 100644
index 00000000..8a8254a1
--- /dev/null
+++ b/etl/access_reporting/app.py
@@ -0,0 +1,440 @@
+import os
+from msal import ConfidentialClientApplication
+from datetime import datetime, timedelta
+import requests
+from functools import wraps
+import time
+import logging
+from io import BytesIO
+import pandas as pd
+
+# Configure logging
+logger = logging.getLogger(__name__)
+if not logger.handlers:
+ handler = logging.StreamHandler()
+ formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+logger.setLevel(logging.INFO)
+
+
+def handle_error(response):
+    """
+    Handle errors based on HTTP status codes and log detailed information.
+    :param response: The HTTP response of a request that did not return a 200.
+    :return: 'retry' for 429, 500 and 503 responses, after sleeping for the Retry-After delay.
+    :raises ValueError: For every other status code.
+    """
+    try:
+        error_json = response.json().get('error', {})
+    except ValueError:
+        # The body is not JSON, so there are no error details to report
+        error_json = {}
+
+    error_code = error_json.get('code', 'unknownError')
+    error_message = error_json.get('message', 'No detailed error message provided.')
+    inner_error = error_json.get('innererror', {})
+    details = error_json.get('details', [])
+
+    logger.error(f"Error Code: {error_code}")
+    logger.error(f"Error Message: {error_message}")
+    if inner_error:
+        logger.error(f"Inner Error: {inner_error}")
+    if details:
+        logger.error(f"Error Details: {details}")
+
+    status_code = response.status_code
+
+    # Throttling and transient server errors are retried once the requested delay has passed
+    if status_code in (429, 500, 503):
+        retry_after = _retry_after_seconds(response)
+        if status_code == 429:
+            logger.warning(f"Too Many Requests. Retrying after {retry_after} seconds...")
+        else:
+            logger.error(f"Server error. Retrying after {retry_after} seconds...")
+        time.sleep(retry_after)
+        return 'retry'
+
+    if status_code == 401:
+        logger.error("Unauthorized. Token might be invalid.")
+    elif status_code == 403:
+        logger.error("Forbidden. Access denied to the requested resource.")
+    elif status_code == 404:
+        logger.error("Not Found. The requested resource doesn’t exist.")
+
+    # Anything that is not retryable is surfaced to the caller
+    raise ValueError(f"API request failed with status code {status_code} - {error_message}")
+
+
+def _retry_after_seconds(response, default=5):
+    """
+    Read the Retry-After header as a non-negative number of seconds.
+    :param response: The HTTP response whose headers are inspected.
+    :param default: The delay used when the header is missing or is not an integer (e.g. an HTTP date).
+    :return: The number of seconds to wait before retrying.
+    """
+    try:
+        # time.sleep raises on negative values, so the delay is floored at 0
+        return max(int(response.headers.get('Retry-After', default)), 0)
+    except (TypeError, ValueError):
+        return default
+
+
+def api_call_decorator(func):
+    """
+    Handles various aspects of the API call, including refreshing the access token if needed and handling pagination.
+
+    The decorated method must return a tuple of (HTTP method, URL, JSON payload or None). When the call resolves
+    a truthy `page_size` (passed positionally, by keyword, or taken from the method's default), every page is
+    fetched by following `@odata.nextLink` and the combined items are returned as {'value': [...]}. Otherwise
+    the JSON of the single response is returned as is.
+    :param func: The function to be decorated.
+    :return: The wrapped function.
+    :raises ValueError: When the request fails with a non-retryable status, or keeps failing after the retries.
+    """
+    # Imported here so that the decorator does not rely on a module level import
+    import inspect
+
+    signature = inspect.signature(func)
+    # Maximum number of consecutive retries of a single request before giving up
+    max_retries = 5
+
+    @wraps(func)
+    def wrapper(self, *args, **kwargs):
+        try:
+            # Check and refresh the access token if needed
+            if self.is_access_token_expired():
+                self.retrieve_access_token()
+                logger.info("Access token refreshed.")
+
+            # Get the HTTP method, URL, and optionally data from the function
+            http_method, url, data = func(self, *args, **kwargs)
+
+            # Resolve page_size from the call, including positional arguments and the declared default.
+            # Looking at kwargs only would silently disable pagination for callers that rely on the default
+            bound_arguments = signature.bind(self, *args, **kwargs)
+            bound_arguments.apply_defaults()
+            page_size = bound_arguments.arguments.get('page_size', kwargs.get('page_size', None))
+
+            results = []
+            response_data = {}
+            n_calls = 0
+            retries = 0
+
+            while url:
+                logger.info("Making call for page: " + str(n_calls + 1))
+                n_calls += 1
+                response = requests.request(http_method, url, headers=self.headers, json=data)
+
+                if response.status_code != 200:
+                    # handle_error sleeps and returns 'retry' for retryable statuses, and raises otherwise
+                    handle_error(response)
+                    retries += 1
+                    if retries > max_retries:
+                        raise ValueError(
+                            f"API request failed with status code {response.status_code} "
+                            f"after {max_retries} retries"
+                        )
+                    continue
+
+                # A successful call resets the retry budget for the next page
+                retries = 0
+                response_json = response.json()
+
+                if not page_size:
+                    # Capture the full response for non-paginated calls
+                    response_data = response_json
+                    break
+
+                results.extend(response_json.get('value', []))
+                url = response_json.get('@odata.nextLink', None)
+                logger.info(f"Next page URL: {url}")
+
+            if page_size:
+                response_data = {'value': results}
+
+            return response_data
+
+        except Exception:
+            logger.exception("An error occurred during the API call.")
+            raise
+
+    return wrapper
+
+
+class SharePointClient:
+    """
+    Client for the Microsoft Graph API, used to list and download the contents of a SharePoint site's drive.
+    Authentication uses the MSAL client credentials flow.
+    """
+
+    # The token dictionary returned by MSAL; the bearer token is stored under 'access_token'
+    access_token = None
+    # When the current token was requested
+    access_token_request_timestamp = None
+    # When the current token should be considered expired
+    access_token_expiry = None
+    # Authorization headers sent with every Graph request
+    headers = None
+
+    # Format used to serialise the token timestamps
+    TIMESTAMP_FORMAT = "%Y-%m-%dT%H:%M:%S.%fZ"
+
+    def __init__(self, tenant_id, client_id, client_secret, site_id, access_token=None,
+                 access_token_expiration_details=None):
+        """
+        Initializes the SharePointClient with necessary credentials and site information.
+        :param tenant_id: The tenant ID.
+        :param client_id: The client ID.
+        :param client_secret: The client secret.
+        :param site_id: The site ID.
+        :param access_token: The access token (optional)
+        :param access_token_expiration_details: The access token expiration details (optional)
+        :raises ValueError: If an access token is given without its expiration details.
+        """
+        self.tenant_id = tenant_id
+        self.client_id = client_id
+        self.client_secret = client_secret
+
+        if access_token:
+            if not access_token_expiration_details:
+                raise ValueError("Access token expiration details must be provided.")
+            self.access_token = access_token
+            self.set_access_token_expiration_details(access_token_expiration_details)
+            self.headers = {
+                'Authorization': f"Bearer {self.access_token['access_token']}"
+            }
+        else:
+            self.retrieve_access_token()
+
+        # Retrieve static identifiers. The site id must be set before the drive is requested
+        self.site_id = site_id
+        self.document_drive = self.get_documents_drive()
+
+    def get_token_expiration_details(self):
+        """
+        Returns the access token expiration details. Converts the datetime objects to strings for serialization.
+        :return: Dictionary with the request timestamp and the expiry, formatted with TIMESTAMP_FORMAT.
+        """
+        return {
+            'access_token_request_timestamp': datetime.strftime(
+                self.access_token_request_timestamp, self.TIMESTAMP_FORMAT
+            ),
+            'access_token_expiry': datetime.strftime(self.access_token_expiry, self.TIMESTAMP_FORMAT)
+        }
+
+    def set_access_token_expiration_details(self, access_token_expiration_details):
+        """
+        Sets the access token expiration details from a serialized dictionary.
+        :param access_token_expiration_details: The serialized access token expiration details.
+        :return: None
+        """
+        self.access_token_request_timestamp = datetime.strptime(
+            access_token_expiration_details['access_token_request_timestamp'], self.TIMESTAMP_FORMAT
+        )
+        self.access_token_expiry = datetime.strptime(
+            access_token_expiration_details['access_token_expiry'], self.TIMESTAMP_FORMAT
+        )
+
+    def is_access_token_expired(self):
+        """
+        Checks if the access token has expired. Retrieving a new token is left to the caller.
+        :return: True if expired, False otherwise.
+        """
+        return datetime.now() >= self.access_token_expiry
+
+    def retrieve_access_token(self, refresh=False):
+        """
+        Implements authentication using MSAL.
+        :param refresh: If True, force a refresh of the access token.
+        :return: None
+        :raises ValueError: If no access token could be acquired.
+        """
+        app = ConfidentialClientApplication(
+            self.client_id,
+            authority=f"https://login.microsoftonline.com/{self.tenant_id}",
+            client_credential=self.client_secret
+        )
+
+        scope = ["https://graph.microsoft.com/.default"]
+
+        # Taken before the request so that the computed expiry errs on the early side
+        access_token_request_timestamp = datetime.now()
+
+        if refresh:
+            logger.info("Forcing refresh of access token.")
+            token = app.acquire_token_for_client(scopes=scope)
+        else:
+            # Check if a token is already cached
+            token = app.acquire_token_silent(scope, account=None)
+
+            if not token:
+                token = app.acquire_token_for_client(scopes=scope)
+
+        if "access_token" not in token:
+            logger.error("Authentication failed.")
+            raise ValueError("Authentication failed")
+
+        # The token is treated as expired 20 seconds early, to avoid using it right at the boundary
+        access_token_expiry = access_token_request_timestamp + timedelta(
+            seconds=token['expires_in'] - 20
+        )
+
+        self.access_token = token
+        self.access_token_request_timestamp = access_token_request_timestamp
+        self.access_token_expiry = access_token_expiry
+        self.headers = {
+            'Authorization': f"Bearer {self.access_token['access_token']}"
+        }
+
+        logger.info("Access token retrieved successfully.")
+
+    @api_call_decorator
+    def get_documents_drive(self):
+        """
+        Get the document drive of the SharePoint site.
+        :return: Tuple containing HTTP method, URL, and None for data.
+        """
+        url = f"https://graph.microsoft.com/v1.0/sites/{self.site_id}/drive"
+        logger.info(f"Getting document drive from URL: {url}")
+        return 'GET', url, None
+
+    @api_call_decorator
+    def list_folder_contents(self, drive_id, folder_path: str, page_size: int = 100):
+        """
+        This function will list the contents of a folder in SharePoint.
+        :param drive_id: The ID of the drive.
+        :param folder_path: The path of the folder.
+        :param page_size: The number of items per page (default is 100).
+        :return: Tuple containing HTTP method, URL, and None for data.
+        """
+        url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{folder_path}:/children?$top={page_size}"
+        logger.info(f"Listing folder contents from URL: {url}")
+        return 'GET', url, None
+
+    @staticmethod
+    def download_sharepoint_file(download_url):
+        """
+        Downloads a file from the given URL and returns its content.
+
+        :param download_url: The URL to download the file from.
+        :return: A BytesIO holding the content of the downloaded file, positioned at the beginning.
+        """
+        file_content = BytesIO()
+
+        # The response is used as a context manager so that the streamed connection is always released
+        with requests.get(download_url, stream=True) as response:
+            response.raise_for_status()  # Check if the request was successful
+
+            # Read the file content into memory
+            for chunk in response.iter_content(chunk_size=8192):
+                file_content.write(chunk)
+
+        file_content.seek(0)  # Reset the file pointer to the beginning
+
+        return file_content
+
+    def download_sharepoint_folder(self, drive_id, folder_path, download_dir, excluded_file_types=None):
+        """
+        Downloads all files in a SharePoint folder, including its subfolders, to the specified local directory.
+
+        :param drive_id: The ID of the SharePoint drive.
+        :param folder_path: The path of the folder in SharePoint.
+        :param download_dir: The local directory to save the downloaded files.
+        :param excluded_file_types: A list of file types to exclude from download (default is None).
+        """
+
+        excluded_file_types = [] if excluded_file_types is None else excluded_file_types
+
+        # Ensure the download directory exists
+        os.makedirs(download_dir, exist_ok=True)
+
+        # List folder contents. page_size is passed by keyword so that the listing is paginated and folders
+        # holding more than one page of items are downloaded in full
+        folder_contents = self.list_folder_contents(drive_id, folder_path, page_size=100)
+        files = folder_contents.get('value', [])
+
+        for item in files:
+            if item.get('folder'):  # Check if it's a folder
+                # Recursively handle subfolders, applying the same exclusions as for the parent folder
+                subfolder_path = f"{folder_path}/{item['name']}"
+                subfolder_dir = os.path.join(download_dir, item['name'])
+                self.download_sharepoint_folder(
+                    drive_id, subfolder_path, subfolder_dir, excluded_file_types=excluded_file_types
+                )
+            else:
+                # It's a file, download it
+                file_name = item['name']
+                if file_name.split(".")[-1] in excluded_file_types:
+                    continue
+                download_url = item['@microsoft.graph.downloadUrl']
+
+                logger.info(f"Downloading file: {file_name}")
+                file_content = self.download_sharepoint_file(download_url)
+
+                # Save the file locally
+                file_path = os.path.join(download_dir, file_name)
+                with open(file_path, 'wb') as f:
+                    f.write(file_content.read())
+
+                logger.info(f"File saved to: {file_path}")
+
+
+def app():
+    """
+    Download each customer's survey outcomes workbook from SharePoint and compute the access rates per surveyor
+    and per week. Only locations that point directly at an .xlsx file are processed.
+
+    NOTE(review): the computed `surveyor_outcomes` and `weekly_access` frames are neither returned nor stored,
+    and `results` is never populated - this looks unfinished, confirm the intended output.
+    :raises ValueError: If a workbook cannot be found in its SharePoint folder.
+    """
+    # Customers for WC 18/11/2024
+    #
+    # ----- Eastlight location -----
+    # No data this week, low on data
+    # Housing Associations/Eastlight/Survey Outcomes/
+    #
+    # ----- Settle location -----
+    # No data this week, in separate files
+    # Housing Associations/Settle/Survey Outcomes/
+    #
+    # ----- Community Housing -----
+    # In separate files - will we get to a singular form?
+    # Housing Associations/Community Housing/Survey Outcomes/
+    #
+    # ----- ACIS location -----
+    # Doesn't have this week's data
+    # Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx
+    #
+    # ----- Southern location -----
+    #
+    #
+    # ------ Unitas location ------
+    # Does have this week's data
+    # Unitas location: Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx
+
+    locations = {
+        "Unitas": "Housing Associations/Unitas/Survey Outcomes/Unitas.xlsx",
+        "Eastlight": "Housing Associations/Eastlight/Survey Outcomes/",
+        "Settle": "Housing Associations/Settle/Survey Outcomes/",
+        "Community Housing": "Housing Associations/Community Housing/Survey Outcomes/",
+        "ACIS": "Housing Asociation/ACIS/Survey Outcomes/ACIS Group - 25.11.2024 - USE THIS.xlsx",
+        "Southern": None,
+    }
+
+    SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
+    SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
+    SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
+    WARMFRONT_SHAREPOINT_SITE_ID = os.getenv("WARMFRONT_SHAREPOINT_SITE_ID", None)
+
+    sharepoint_client = SharePointClient(
+        tenant_id=SHAREPOINT_TENANT_ID,
+        client_id=SHAREPOINT_CLIENT_ID,
+        client_secret=SHAREPOINT_CLIENT_SECRET,
+        site_id=WARMFRONT_SHAREPOINT_SITE_ID
+    )
+
+    results = []
+    for customer, location in locations.items():
+        if location is None:
+            continue
+
+        if location.endswith(".xlsx"):
+            # Read in the file
+            # List the contents of the folder. page_size is passed by keyword so that the listing is paginated
+            # and the workbook is found even when the folder holds more than one page of items
+            location_folder = os.path.dirname(location)
+            contents = sharepoint_client.list_folder_contents(
+                drive_id=sharepoint_client.document_drive["id"],
+                folder_path=location_folder,
+                page_size=100
+            )
+            filepaths = contents["value"]
+
+            download_url = next(
+                (file['@microsoft.graph.downloadUrl'] for file in filepaths
+                 if '@microsoft.graph.downloadUrl' in file and file['name'] == os.path.basename(location)),
+                None
+            )
+
+            if download_url is None:
+                raise ValueError("File not found in the SharePoint folder.")
+
+            file_content = sharepoint_client.download_sharepoint_file(download_url)
+
+            # Convert to pandas dataframe since file is an excel file
+            df = pd.read_excel(file_content)
+            df["Outcome"] = df["Outcome"].str.strip().str.lower()
+
+            # We cannot group by funding type accurately because any job that is not funded will have a NaN value
+            # and therefore we have a 100% access rate for funded jobs and 0% otherwise. We group by week and
+            # surveyor only, and derive the funding label from the rows of each group
+            surveyor_outcomes = []
+            for (week, surveyor), group in df.groupby(["Week Commencing", "DEA/REA"]):
+                funding_type = [x for x in group["Funding Type"].unique() if not pd.isnull(x)]
+                if funding_type:
+                    funding_type = " + ".join(funding_type)
+                else:
+                    funding_type = "No Funding"
+                surveyed = group[group["Outcome"] == "surveyed"]
+                no_answer = group[
+                    group["Outcome"] == "no answer"
+                ]
+                other_issue = group[~group["Outcome"].isin(["surveyed", "no answer"])]
+
+                surveyor_outcomes.append(
+                    {
+                        "Surveyor": surveyor,
+                        "Week": week,
+                        "Funding": funding_type,
+                        "Surveyed": surveyed.shape[0],
+                        "No Answer": no_answer.shape[0],
+                        "Other Issue": other_issue.shape[0],
+                    }
+                )
+
+            surveyor_outcomes = pd.DataFrame(surveyor_outcomes)
+            surveyor_outcomes["Week"] = pd.to_datetime(surveyor_outcomes["Week"])
+
+            # Weekly totals across all surveyors, split by funding label
+            weekly_access = (
+                surveyor_outcomes.drop(columns=["Surveyor"]).groupby(["Week", "Funding"]).sum().reset_index()
+            )
+            # Sort by week and surveyor ascending
+            surveyor_outcomes = surveyor_outcomes.sort_values(["Week", "Surveyor"], ascending=[True, True])
+            surveyor_outcomes["Access Rate"] = 100 * surveyor_outcomes["Surveyed"] / (
+                surveyor_outcomes["Surveyed"] + surveyor_outcomes["No Answer"] + surveyor_outcomes["Other Issue"]
+            )
+
+            weekly_access["Total"] = (
+                weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"]
+            )
+            weekly_access["Access Rate"] = 100 * weekly_access["Surveyed"] / (
+                weekly_access["Surveyed"] + weekly_access["No Answer"] + weekly_access["Other Issue"]
+            )
diff --git a/etl/access_reporting/requirements.txt b/etl/access_reporting/requirements.txt
new file mode 100644
index 00000000..8e6dbb08
--- /dev/null
+++ b/etl/access_reporting/requirements.txt
@@ -0,0 +1,11 @@
+python-docx==0.8.11
+PyPDF2==3.0.1
+boto3
+requests
+pandas
+pyarrow==12.0.1
+openpyxl==3.1.2
+usaddress==0.5.10
+pdfplumber==0.10.3
+msgpack==1.0.5
+msal
\ No newline at end of file
diff --git a/etl/costs/app.py b/etl/costs/app.py
index 797191d2..f2bf365b 100644
--- a/etl/costs/app.py
+++ b/etl/costs/app.py
@@ -11,7 +11,7 @@ import inspect
src_file_path = inspect.getfile(lambda: None)
-DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20240917 Hestia Materials.xlsx"
+DATA_DIRECTORY = Path(src_file_path).parent / "local_data" / "20250316 Domna Materials.xlsx"
# Environment file is at the same level as this file
ENV_FILE = Path(src_file_path).parent / "etl" / "costs" / ".env"
dotenv.load_dotenv(ENV_FILE)
@@ -91,6 +91,7 @@ def app():
lel_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="low_energy_lighting", header=0)
flat_roof_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="flat_roof_insulation", header=0)
window_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="window_glazing", header=0)
+ rir_insulation_costs = pd.read_excel(DATA_DIRECTORY, sheet_name="room_roof_insulation", header=0)
# Form a single table to be uploaded
costs = pd.concat(
@@ -104,7 +105,8 @@ def app():
ewi_costs,
lel_costs,
flat_roof_costs,
- window_costs
+ window_costs,
+ rir_insulation_costs,
]
)
diff --git a/etl/customers/aiha/bid_numbers.py b/etl/customers/aiha/bid_numbers.py
new file mode 100644
index 00000000..b371e2e5
--- /dev/null
+++ b/etl/customers/aiha/bid_numbers.py
@@ -0,0 +1,106 @@
+"""
+This is an adhoc script, used to pull together some of the figures that are being included in the
+Warm Homes: Social Housing Wave 3 funding application
+"""
+
+import pandas as pd
+import numpy as np
+
+aiha_all_units = pd.read_excel(  # "All Properties - AIHA" sheet of the measures packages workbook
+ "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx",
+ sheet_name="All Properties - AIHA",
+ header=2
+)
+modelled_units = pd.read_excel(  # "Modelled Properties - Measures" sheet of the same workbook
+ "/Users/khalimconn-kowlessar/Downloads/AIHA Measures Packages 2024_11_13.xlsx",
+ sheet_name="Modelled Properties - Measures",
+ header=5
+)
+aiha_all_units = aiha_all_units.drop(columns=['Unnamed: 0', 'Unnamed: 1'])
+aiha_extracted_property_data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv"
+)
+aiha_wave_3_units = aiha_all_units[aiha_all_units["Expected Package Cost"].astype(float) > 0]
+# TODO: The EPC C property isn't a C!
+aiha_epc_breakdown = aiha_wave_3_units["Expected EPC Rating"].replace({"D or E": "E"}).value_counts()
+# Current EPC rating counts for the modelled units whose Survey Key contains "CAHA"
+caha_epc_breakdown = modelled_units[
+ modelled_units['Survey Key'].str.contains("CAHA")
+]['Current EPC Rating'].value_counts()
+# Current EPC rating counts for the modelled units whose Survey Key contains "HORNSEY"
+hornsey_epc_breakdown = modelled_units[
+ modelled_units['Survey Key'].str.contains("HORNSEY")
+]['Current EPC Rating'].value_counts()
+
+aiha_original_asset_data = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/240924- KSQ & Domna Info Merge - AIHA - SHDF Wave 3 "
+ "bid - Supplementary information.xlsx",
+ sheet_name="Archetyping Data",
+ header=2
+)
+
+# Get the units in the bid: inner join of the archetyping rows onto the wave 3 units, matched on the address columns
+aiha_wave_3_features = aiha_original_asset_data[
+ ['Address letter or number', 'Street address', 'Postcode', "Wall type",
+ "Property type", "built-form", "floor"]
+].merge(
+ aiha_wave_3_units[['Address letter or number', 'Street address', 'Postcode']],
+ how="inner",
+ on=["Address letter or number", "Street address", "Postcode"]
+)
+
+wall_type_breakdown = aiha_wave_3_features["Wall type"].value_counts()
+property_type_breakdown = aiha_wave_3_features.groupby(["Property type", "floor"]).size().reset_index()
+
+aiha_wave_3_features[aiha_wave_3_features["Property type"] == "Flat"][["Street address", "Postcode"]]  # result is not stored
+
+# 4 Yetev Lev Court ... Semi-Detached mid - Medium
+# B 86 Bethune Road ... Mid-Terrace top. - Low
+# A 80 Bethune Road ... Mid-Terrace ground. - Low
+# B 80 Bethune Road ... \n \n - Low
+# A 9 Clapton Common ... Semi-Detached ground. - Low
+# C 9 Clapton Common ... End-Terrace \n. - Low
+# B 89 Manor Road ... \n \n. - Low
+# A 6 Northfield Road ... Detached top. - Low
+# 13 Northfield Rd ... Semi-Detached \n - Low
+# A 73 Manor Road ... End-Terrace \n - Low
+# B 73 Manor Road ... Detached top - Low
+
+# Hornsey data - contained in original asset list
+hornsey_asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
+ "Trust.xlsx",
+ sheet_name="Ksquared-All units information",
+ header=3
+)
+
+# We don't need the first row
+hornsey_asset_list = hornsey_asset_list.iloc[1:]
+# Fill NA values with empty strings
+hornsey_asset_list = hornsey_asset_list.fillna("")
+hornsey_asset_list["Address letter or number"] = hornsey_asset_list["Address letter or number"].astype(
+ str
+).str.strip()
+hornsey_asset_list["Postcode"] = hornsey_asset_list["Postcode"].astype(str).str.strip()
+hornsey_asset_list["Street address"] = hornsey_asset_list["Street address"].astype(str).str.strip()
+# Replace double spaces. NOTE(review): both replace() arguments below look identical (a no-op) - confirm the first is two spaces
+for col in ["Address letter or number", "Street address", "Postcode"]:
+ hornsey_asset_list[col] = hornsey_asset_list[col].str.replace(" ", " ")
+
+hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""]  # drop rows with no unit id
+
+hornsey_asset_list["Wall Type Cleaned"] = np.where(  # anything not mentioning "Cavity" is labelled "Solid"
+ hornsey_asset_list["Wall type"].str.contains("Cavity"),
+ "Cavity",
+ "Solid"
+)
+
+hornsey_asset_list["Property type"].value_counts()  # result is not stored
+
+# CAHA
+caha_epc_data = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_extracted_property_data.xlsx"
+)
+
+caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["property_type"].value_counts()  # result is not stored
+caha_epc_data[caha_epc_data["address"] != "33 Woodhouse Road"]["wall_type"].value_counts()  # result is not stored
diff --git a/etl/customers/aiha/xml_extraction.py b/etl/customers/aiha/xml_extraction.py
new file mode 100644
index 00000000..44baef80
--- /dev/null
+++ b/etl/customers/aiha/xml_extraction.py
@@ -0,0 +1,988 @@
+import os
+from io import BytesIO
+
+import pandas as pd
+
+from etl.xml_survey_extraction.XmlParser import XmlParser
+
+SURVEY_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/RESIDENT SURVEYS"
+CONTINGENCY_RATE = 0.26
+
+
+def sap_to_epc(sap_points: int | float):
+    """
+    Convert SAP points to an EPC rating band.
+
+    Missing input is passed through as None instead of being rated: NaN fails every comparison,
+    so it would otherwise fall through all the bands and be reported as "G".
+    :param sap_points: numerical value of SAP points, typically between 0 and 100
+    :return: EPC band "A" to "G", or None when sap_points is None or NaN
+    :raises ValueError: if sap_points is zero or negative
+    """
+    # NaN is the only value that is not equal to itself, so no import is needed to detect it.
+    if sap_points is None or sap_points != sap_points:
+        return None
+
+    if sap_points <= 0:
+        raise ValueError("SAP points should be above 0.")
+
+    # Lower bound of each band, highest first; anything below 21 is a "G".
+    bands = ((92, "A"), (81, "B"), (69, "C"), (55, "D"), (39, "E"), (21, "F"))
+    for lower_bound, band in bands:
+        if sap_points >= lower_bound:
+            return band
+
+    return "G"
+
+
+def main():
+ """
+ Extract data from the XML files in the survey folders, cost the hard-coded measure packages and write CSVs.
+ :return: None - the outputs are the CSV files written with to_csv
+ """
+ # Step 1: List all subfolders inside SURVEY_FOLDER_PATH.
+ subfolders = [f.path for f in os.scandir(SURVEY_FOLDER_PATH) if f.is_dir()]
+
+ # Step 2: Loop through each subfolder and find the XML files.
+ extracted_surveys = []
+ for subfolder in subfolders:
+ print(f"Searching in subfolder: {subfolder}")
+
+ # Find all XML files in the current subfolder.
+ xml_files = [f for f in os.listdir(subfolder) if f.endswith('.xml')]
+
+ if not xml_files:
+ print(f"No XML files found in subfolder: {subfolder}")
+ continue
+
+ # If any XML files are found, perform the data extraction. We use the subfolder name as the survey key.
+ for xml_file in xml_files:
+ xml_path = os.path.join(subfolder, xml_file)
+ print(f"Processing XML file: {xml_path}")
+
+ # Read in the XML and parse it using the XmlParser class.
+ with open(xml_path, 'rb') as file:
+ xml_data_io = BytesIO(file.read())
+ uprn = None # Set the UPRN if available.
+
+ # Create an XmlParser instance
+ xml_parser = XmlParser(
+ file=xml_data_io,
+ filekey=xml_path,
+ surveyor_company="",
+ uprn=uprn,
+ )
+
+ # Run the parser to extract the data
+ xml_parser.run()
+ if not xml_parser.epc:
+ # The parser produced no EPC data for this file (original note: "If we don't have a lig xml") - skip it
+ continue
+
+ # Store the extracted data for further processing
+ extracted_surveys.append({
+ "survey_key": subfolder.split("/")[-1],
+ **xml_parser.epc,
+ **xml_parser.additional_data
+ })
+
+ print(f"Extracted {len(extracted_surveys)} surveys.")
+ # Process the extracted_surveys as needed, for example, save to a database or write to a file.
+ extracted_surveys = pd.DataFrame(extracted_surveys)
+
+ # This is the data we need for the AIHA project
+ measures_data = extracted_surveys[
+ ["survey_key", "address", "postcode", "current-energy-efficiency", "current-energy-rating",
+ "number_of_floors", "walls-description", "property-type", "built-form"]
+ ]
+ measures_data = measures_data.sort_values("survey_key", ascending=True)
+ measures_data.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/extracted_property_data.csv",
+ )
+
+ # Note:
+ # The properties will still have "Very poor" ratings for their hot water
+
+ # TODO
+ # - AIH001-03 has a loft that is inaccessible - ask Chenai about why this property didn't have access to the loft
+ # [Can't remember, not clear - Chenai will check]
+ # - AIH001-08 and AIH001-09, check if it's freehold - could solar work as both of these units are part of the same
+ # building [Question for Lewis & Kevin]
+ # - AIH001-09 - Is it not possible to install a loft hatch? [IT IS NOT, NO ACCESS - would need to be accessed from
+ # the other unit]
+ # - AIH001-09 - Is there definitely an immersion water heater? Is this definitely the case for the other units?
+ # [Question for Lewis & Kevin] - [YES - ASHP!!!!]
+
+ # TODO: Check which properties are in a conservation area
+ # TODO: AIH001-16 - Is the loft insulation suitable (already has 100mm in the RIR)
+ # TODO: Adjust Archetype 14 homes to exclude double glazing? Or should we exclude entirely
+
+ recommended_measures = [
+ {
+ "survey_key": "AIH001-01",
+ "starting_sap": 69,
+ "recommended_measures": [],
+ "notes": "Is EPC C"
+ },
+ {
+ "survey_key": "AIH001-02",
+ "starting_sap": 65,
+ "recommended_measures": [
+ {
+ "measure": "Solar PV",
+ "description": "2.4kWp Solar PV system",
+ "config": [
+ {
+ "size": "2.4W",
+ "orientation": "Horizontal",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 7,
+ "ending_sap": 72,
+ "notes": "The array can be mounted on the flat roof, so that panels are south facing"
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 4,
+ "ending_sap": 76
+ }
+ ],
+ },
+ {
+ "survey_key": "AIH001-03",
+ "starting_sap": 43,
+ "recommended_measures": [
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 1,
+ "ending_sap": 44,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "4kWp Solar PV system",
+ "config": [
+ {
+ "size": "4kWp",
+ "orientation": "East",
+ "elavation": 30,
+ "overshading": "None or little",
+ },
+ ],
+ "sap_points": 10,
+ "ending_sap": 54
+ },
+ {
+ "measure": "Air Source Heat Pump",
+ "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)",
+ "sap_points": 20,
+ "ending_sap": 74
+ },
+ {
+ "measure": "Tariff Review",
+ "description": "Switch to 24-hour tariff",
+ "sap_points": 15,
+ "ending_sap": 89
+ }
+ ],
+ "notes": "Unclear if the loft is accessible"
+ },
+ {
+ "survey_key": "AIH001-04",
+ "starting_sap": 48,
+ "recommended_measures": [
+ {
+ "measure": "Flat Roof Insulation",
+ "description": "100mm flat roof insulation",
+ "floor_area": 39.1482, # based on area of top floor
+ "sap_points": 4,
+ "ending_sap": 52
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 55
+ },
+ {
+ "measure": "Solar PV",
+ "description": "4kWp Solar PV system",
+ "config": [
+ {
+ "size": "4kWp",
+ "orientation": "South",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 15,
+ "ending_sap": 70
+ }
+ ],
+ "notes": "Roof is flat, PV array should be installed south facing with elevation"
+ },
+ {
+ "survey_key": "AIH001-05",
+ "starting_sap": 54,
+ "recommended_measures": [
+ {
+ "measure": "Flat Roof Insulation",
+ "description": "100mm flat roof insulation",
+ "floor_area": 49.48, # based on area of top floor
+ "sap_points": 5,
+ "ending_sap": 59,
+ },
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 2,
+ "ending_sap": 61,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "4kWp Solar PV system",
+ "config": [
+ {
+ "size": "4kW",
+ "orientation": "Horizontal",
+ "elavation": 30,
+ "overshading": "Modest",
+ }
+ ],
+ "sap_points": 9,
+ "ending_sap": 70
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 73
+ }
+ ],
+ "notes": ""
+ },
+ {
+ "survey_key": "AIH001-06",
+ "starting_sap": 62,
+ "recommended_measures": [
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 2,
+ "ending_sap": 64,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "2kWp Solar PV system",
+ "config": [
+ {
+ "size": "2kW",
+ "orientation": "South",
+ "elavation": 30,
+ "overshading": "Modest",
+ }
+ ],
+ "sap_points": 6,
+ "ending_sap": 70
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-07",
+ "starting_sap": 74,
+ "recommended_measures": [],
+ "notes": "Is EPC C"
+ },
+ {
+ "survey_key": "AIH001-08",
+ "starting_sap": 56,
+ "recommended_measures": [
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 54.2864, # Based on area of top floor
+ "sap_points": 2,
+ "ending_sap": 58,
+ },
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 4,
+ "ending_sap": 62,
+ },
+ {
+ "measure": "Internal Wall Insulation",
+ "description": "100mm internal wall insulation",
+ "hlp": 24.13 * 2.63,
+ "sap_points": 7,
+ "ending_sap": 69,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 69,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-09",
+ "starting_sap": 44,
+ "recommended_measures": [
+ {
+ "measure": "Internal Wall Insulation",
+ "description": "100mm internal wall insulation",
+ "hlp": (22.35 * 3.24) + (22.13 * 2.53),
+ "sap_points": 8,
+ "ending_sap": 52,
+ },
+ {
+ "measure": "Cavity Wall Insulation",
+ "description": "CWI to rdSAP default standard",
+ "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension
+ "sap_points": 1,
+ "ending_sap": 53,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 53,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 56,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "1.6kWp Solar PV system",
+ "config": [
+ {
+ "size": "1.6W",
+ "orientation": "South-East",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 6,
+ "ending_sap": 62
+ },
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension
+ "sap_points": 8,
+ "ending_sap": 70,
+ "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, "
+ "which is also owned by AIHA"
+ }
+ ],
+ "notes": "This property is a house split into 2 flats. We can install a PV array for both units (one array"
+ "per unit). Area on south-east part of roof is ~22m2 with no overshadowing. Flat roof area is 8m2"
+ "with modest overshadowing. We suggest a 3.2kWp system, across two units"
+ },
+ {
+ "survey_key": "AIH001-11",
+ "starting_sap": 59,
+ "recommended_measures": [
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 4,
+ "ending_sap": 63,
+ },
+ {
+ "measure": "Internal Wall Insulation",
+ "description": "100mm internal wall insulation",
+ "hlp": (18.50 * 3.12) + (19.00 * 2.75),
+ "sap_points": 5,
+ "ending_sap": 68,
+ },
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 1,
+ "ending_sap": 69,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-12",
+ "starting_sap": 46,
+ "recommended_measures": [
+ {
+ "measure": "Double Glazing",
+ "description": "Installation of double glazing",
+ "n_windows": 20, # Counted the bay windows each as 3
+ "windows_area": 10.66,
+ "sap_points": 3,
+ "ending_sap": 49,
+ },
+ # {
+ # "measure": "Solar PV",
+ # "description": "3.2kWp Solar PV system",
+ # "config": [
+ # {
+ # "size": "3.2W",
+ # "orientation": "East",
+ # "elavation": 30,
+ # "overshading": "Little or none",
+ # }
+ # ],
+ # "sap_points": 9,
+ # "ending_sap": 58
+ # },
+ {
+ "measure": "Air Source Heat Pump",
+ "description": "Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)",
+ "sap_points": 15,
+ "ending_sap": 65
+ },
+ {
+ "measure": "Tariff Review",
+ "description": "Switch to 24-hour tariff",
+ "sap_points": 15,
+ "ending_sap": 80
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-13",
+ "starting_sap": 53,
+ "recommended_measures": [
+ {
+ "measure": "Roof Insulation",
+ "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+ "floor_area": 39.75, # based on the floor area of the RIR
+ "sap_points": 6,
+ "ending_sap": 59,
+ },
+ {
+ "measure": "Flat Roof Insulation",
+ "description": "100mm flat roof insulation",
+ "floor_area": 33.06, # Based on area of the extension
+ "sap_points": 2,
+ "ending_sap": 61,
+ },
+ {
+ "measure": "Cavity Wall Insulation",
+ "description": "CWI to rdSAP default standard",
+ "hlp": (35.40 * 2.65) + (26.70 * 2.73) + (16.30 * 2.71), # 1st & 2nd extension
+ "sap_points": 6,
+ "ending_sap": 67,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 67,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 2,
+ "ending_sap": 69,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "4kWp Solar PV system",
+ "config": [
+ {
+ "size": "4kW",
+ "orientation": "Horizontal",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 9,
+ "ending_sap": 78
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-14",
+ "starting_sap": 63,
+ "recommended_measures": [
+ {
+ "measure": "Cavity Wall Insulation",
+ "description": "CWI to rdSAP default standard",
+ "hlp": (11.00 * 2.6) + (11.00 * 2.65) + (4.60 * 2.7),
+ "sap_points": 5,
+ "ending_sap": 68,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 68,
+ },
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation", # Based on area of main building
+ "floor_area": 59.20,
+ "sap_points": 1,
+ "ending_sap": 69,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "3.2kWp Solar PV system",
+ "sap_points": 10,
+ "ending_sap": 79,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-15",
+ "starting_sap": 60,
+ "recommended_measures": [
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 73.81, # Based on area of main building
+ "sap_points": 1,
+ "ending_sap": 61,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 64,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "3.2kWp Solar PV system",
+ "config": [
+ {
+ "size": "3.2W",
+ "orientation": "North-West",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 7,
+ "ending_sap": 71,
+ "notes": "The array is North-west facing and therefore will be slightly less efficient than south"
+ "facing, however the impact is not so severe as to make the installation not worthwhile."
+ "Ground mounted"
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-16",
+ "starting_sap": 60,
+ "recommended_measures": [
+ {
+ "measure": "Cavity Wall Insulation",
+ "description": "CWI to rdSAP default standard",
+ "hlp": (21.56 * 2.60) + (26.79 * 2.8) + (6.74 * 2.60),
+ "sap_points": 4,
+ "ending_sap": 64,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 64,
+ },
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 20.92, # Based on floor area of RIR
+ "sap_points": 1,
+ "ending_sap": 65,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "2.4kWp Solar PV system",
+ "config": [
+ {
+ "size": "2.4W",
+ "orientation": "South-East",
+ "elavation": 30,
+ "overshading": "Modest",
+ }
+ ],
+ "sap_points": 5,
+ "ending_sap": 70,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-17",
+ "starting_sap": 62,
+ "recommended_measures": [
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 1,
+ "ending_sap": 63,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 66,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "4kWp Solar PV system",
+ "config": [
+ {
+ "size": "3.2kW",
+ "orientation": "East",
+ "elavation": 30,
+ "overshading": "None or little",
+ },
+ {
+ "size": "0.8kW",
+ "orientation": "West",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 12,
+ "ending_sap": 78,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-18",
+ "starting_sap": 58,
+ "recommended_measures": [
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 37.52, # Based on area of main building and 1st extension
+ "sap_points": 7,
+ "ending_sap": 65,
+ },
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 1,
+ "ending_sap": 66,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 2,
+ "ending_sap": 68,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "3.2kWp Solar PV system",
+ "config": [
+ {
+ "size": "3.2W",
+ "orientation": "North-East",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 7,
+ "ending_sap": 75,
+ }
+ ],
+
+ },
+ {
+ "survey_key": "AIH001-19",
+ "starting_sap": 76,
+ "recommended_measures": []
+ },
+ {
+ "survey_key": "AIH001-20",
+ "starting_sap": 82,
+ "recommended_measures": []
+ },
+ {
+ "survey_key": "AIH001-21",
+ "starting_sap": 53,
+ "recommended_measures": [
+ {
+ "measure": "Cylinder Insulation",
+ "description": "80mm cylinder insulation",
+ "sap_points": 2,
+ "ending_sap": 55,
+ },
+ {
+ "measure": "Roof Insulation",
+ "description": "100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)",
+ "floor_area": 22.80, # Based on floor area of RIR
+ "sap_points": 7,
+ "ending_sap": 62,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "2.4kWp Solar PV system",
+ "config": [
+ {
+ "size": "1.6kWp",
+ "orientation": "Horizontal",
+ "elavation": 30,
+ "overshading": "None or little",
+ },
+ {
+ "size": "0.8kWp",
+ "orientation": "South-East",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 9,
+ "ending_sap": 71,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 74,
+ }
+ ]
+ },
+ {
+ "survey_key": "AIH001-SIMULATED-01",
+ "elmhurst_reference": "000020",
+ "starting_sap": None,
+ "recommended_measures": [
+ {
+ "measure": "Internal Wall Insulation",
+ "description": "100mm internal wall insulation",
+ "hlp": (22.35 * 3.24) + (22.13 * 2.53),
+ "sap_points": 8,
+ "ending_sap": 52,
+ },
+ {
+ "measure": "Cavity Wall Insulation",
+ "description": "CWI to rdSAP default standard",
+ "hlp": (2.68 * 2.39) + (5.93 * 2.63) + (6.13 * 2.39), # 1st & 2nd extension
+ "sap_points": 1,
+ "ending_sap": 53,
+ },
+ {
+ "measure": "Ventilation",
+ "description": "2x DMEV fans",
+ "sap_points": 0,
+ "ending_sap": 53,
+ },
+ {
+ "measure": "TTZC",
+ "description": "Smart Thermostat",
+ "sap_points": 3,
+ "ending_sap": 56,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "1.6kWp Solar PV system",
+ "config": [
+ {
+ "size": "1.6W",
+ "orientation": "South-East",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 6,
+ "ending_sap": 62
+ },
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 63.59 + 12.31, # Based on area of main building and 1st extension
+ "sap_points": 8,
+ "ending_sap": 70,
+ "notes": "Loft is inaccessible from this unit - would need to be accessed from the other unit, "
+ "which is also owned by AIHA"
+ }
+ ],
+ "notes": "This was cloned from 80A. There is no existing data for 80B"
+ },
+ {
+ "survey_key": "AIH001-SIMULATED-05",
+ "starting_sap": 68,
+ "recommended_measures": [
+ {
+ "measure": "Loft Insulation",
+ "description": "300mm loft insulation",
+ "floor_area": 42.5,
+ "sap_points": 1,
+ "ending_sap": 69,
+ },
+ {
+ "measure": "Solar PV",
+ "description": "3.2kWp Solar PV system",
+ "config": [
+ {
+ "size": "3.2W",
+ "orientation": "North-East",
+ "elavation": 30,
+ "overshading": "None or little",
+ }
+ ],
+ "sap_points": 8,
+ "ending_sap": 77,
+ }
+ ]
+ }
+ ]
+
+ scaffolding_data = [
+ {
+ "number_of_floors": 2,
+ "price": 841,
+ },
+ {
+ "number_of_floors": 3,
+ "price": 1077,
+ }
+ ]
+
+ # TODO - Need an updated cost for cylinder insulation
+ pricing_data = [
+ {'item': '80mm cylinder insulation', 'unit_price': 50, 'unit': 'unit'},
+ {'item': '100mm internal wall insulation', 'unit_price': 244.8, 'unit': 'hlp_m2'},
+ {'item': 'CWI to rdSAP default standard', 'unit_price': 14.21, 'unit': 'hlp_m2'},
+ {'item': 'Window draught proofing improvements', 'unit_price': 63, 'unit': 'window'},
+ {'item': '100mm flat roof insulation', 'unit_price': 195, 'unit': 'floor_m2'},
+ {'item': 'Switch to 24-hour tariff', 'unit_price': 0, 'unit': None},
+ {'item': 'Installation of double glazing', 'unit_price': 1074, 'unit': 'window'},
+ {'item': 'Ecoforest ecoAIR EVI 4-20 20kW air source heat pump (+TTZC)', 'unit_price': 21189 + 1200,
+ 'unit': 'unit'},
+ {'item': '100mm+ RIR insulation on all surfaces (ceiling u=0.16, walls u=0.3)', 'unit_price': 244.80,
+ 'unit': 'floor_m2'},
+ {'item': '300mm loft insulation', 'unit_price': 16.07, 'unit': 'floor_m2'},
+ {'item': 'Smart Thermostat', 'unit_price': 1200, 'unit': 'unit'},
+ {'item': '2x DMEV fans', 'unit_price': 1070, 'unit': 'unit'},
+ {'item': '1.6kWp Solar PV system', 'unit_price': 3040, 'unit': 'unit_needs_scaffolding'},
+ {'item': '2kWp Solar PV system', 'unit_price': 3201, 'unit': 'unit_needs_scaffolding'},
+ {'item': '2.4kWp Solar PV system', 'unit_price': 3363, 'unit': 'unit_needs_scaffolding'},
+ {'item': '3.2kWp Solar PV system', 'unit_price': 3686, 'unit': 'unit_needs_scaffolding'},
+ {'item': '4kWp Solar PV system', 'unit_price': 4009, 'unit': 'unit_needs_scaffolding'},
+ {'item': '5.6kWp Solar PV system', 'unit_price': 5015, 'unit': 'unit_needs_scaffolding'},
+ ]
+ pricing_data = pd.DataFrame(pricing_data)
+
+ for recommendation in recommended_measures:
+ property_data = measures_data[measures_data["survey_key"] == recommendation["survey_key"]].squeeze()
+ total_cost = 0
+
+ for measure in recommendation["recommended_measures"]:
+ measure_pricing = pricing_data[pricing_data["item"] == measure["description"]]  # priced by description text
+ measure_unit = measure_pricing["unit"].values[0]
+
+ if measure_unit in ["unit", None]:
+ measure_cost = float(measure_pricing["unit_price"].values[0])
+ elif measure_unit == "unit_needs_scaffolding":
+ n_floors = property_data["number_of_floors"]  # NOTE(review): scaffolding_data only prices 2 and 3 floors
+ scaffolding_cost = [x for x in scaffolding_data if x["number_of_floors"] == n_floors][0]["price"]
+ measure_cost = float(measure_pricing["unit_price"].values[0]) + scaffolding_cost
+ elif measure_unit == "floor_m2":
+ measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["floor_area"]
+ elif measure_unit == "hlp_m2":
+ measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["hlp"]
+ elif measure_unit == "window":
+ measure_cost = float(measure_pricing["unit_price"].values[0]) * measure["n_windows"]
+ else:
+ raise Exception("Unknown unit type")
+
+ measure["Total Cost"] = measure_cost
+ total_cost += measure_cost
+
+ recommendation["total_cost"] = total_cost
+
+ # Step 1: Normalize the recommended_measures data into a DataFrame.
+ normalized_measures = []
+ for survey in recommended_measures:
+ survey_key = survey["survey_key"]
+ starting_sap = survey["starting_sap"]
+ total_cost = survey.get("total_cost", 0)
+
+ for measure in survey.get("recommended_measures", []):
+ # Include hlp and floor_area for each measure if available
+ hlp = measure.get("hlp", None)
+ floor_area = measure.get("floor_area", None)
+
+ normalized_measures.append({
+ "survey_key": survey_key,
+ "hlp": hlp,
+ "floor_area": floor_area,
+ "starting_sap": starting_sap,
+ "measure": measure["measure"],
+ "description": measure.get("description", ""),
+ "sap_points": measure.get("sap_points", 0),
+ "measure_cost": measure.get("Total Cost", 0),
+ "total_cost": total_cost
+ })
+
+ # Convert the normalized list into a DataFrame.
+ measures_df = pd.DataFrame(normalized_measures)
+
+ # Step 2: Pivot the measures_df to have a column for each measure type, using the description as values.
+ pivoted_measures = measures_df.pivot_table(
+ index="survey_key",
+ columns="measure",
+ values="description",
+ aggfunc=lambda x: ' '.join(x), # Concatenate descriptions if there are multiple entries.
+ fill_value=None
+ ).reset_index()
+
+ measures_columns = [x for x in pivoted_measures.columns if x not in ["survey_key"]]
+ # We add an empty "Cost of" column for each measure (nothing later in this function fills these in)
+ for measure in measures_columns:
+ pivoted_measures[f"Cost of {measure}"] = None
+
+ pivoted_floor_area = measures_df.pivot_table(
+ index="survey_key",
+ columns="measure",
+ values="floor_area",
+ aggfunc="first" # Use 'first' since each measure should only appear once per survey_key
+ ).add_prefix("floor_area - ").reset_index()
+
+ pivoted_hlp = measures_df.pivot_table(
+ index="survey_key",
+ columns="measure",
+ values="hlp",
+ aggfunc="first"
+ ).add_prefix("hlp - ").reset_index()
+
+ # Merge hlp and floor_area data
+ pivoted_measures = pivoted_measures.merge(pivoted_hlp, on="survey_key", how="left")
+ pivoted_measures = pivoted_measures.merge(pivoted_floor_area, on="survey_key", how="left")
+
+ # Step 3: Calculate the total sap points for each survey (total cost is not aggregated here).
+ totals = measures_df.groupby("survey_key").agg(
+ total_sap_points=("sap_points", "sum"),
+ ).reset_index()
+
+ # Merge total sap points into the pivoted measures.
+ pivoted_measures = pd.merge(pivoted_measures, totals, on="survey_key", how="left")
+ # pivoted_measures["Cost Contingency"] = pivoted_measures["total_cost_of_measures"] * CONTINGENCY_RATE
+ # pivoted_measures["Total Cost"] = pivoted_measures["total_cost_of_measures"] + pivoted_measures["Cost Contingency"]
+
+ # Step 4: Extract starting SAP for each survey key.
+ starting_sap_df = measures_df.drop_duplicates(subset=["survey_key"])[["survey_key", "starting_sap"]]
+
+ # Merge starting SAP back onto pivoted measures.
+ result_df = pd.merge(pivoted_measures, starting_sap_df, on="survey_key", how="left")
+
+ # Step 5: Calculate the ending SAP. NOTE(review): presumably NaN where starting_sap is None (AIH001-SIMULATED-01).
+ result_df["Ending SAP"] = result_df["starting_sap"] + result_df["total_sap_points"]
+ result_df["Ending EPC Rating"] = result_df["Ending SAP"].apply(sap_to_epc)
+
+ # Step 6: Merge the result with the measures_data to get the final DataFrame.
+ final_measures = measures_data.merge(
+ result_df, how="left", on="survey_key"
+ )
+
+ final_measures.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Measures packages.csv")
+
+ # Store costs
+ pricing_data.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Pricing data.csv")
+
+# if __name__ == "__main__":
+# main()
diff --git a/etl/customers/benyon/epc_data.py b/etl/customers/benyon/epc_data.py
new file mode 100644
index 00000000..9ba71f2f
--- /dev/null
+++ b/etl/customers/benyon/epc_data.py
@@ -0,0 +1,71 @@
+"""
+Rough script to get the EPC data for Benyon
+"""
+
+import pandas as pd
+import os
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from asset_list.utils import get_data
+
+load_dotenv(dotenv_path="backend/.env")  # relative path, so this resolves against the current working directory
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/List of All Properties ecl Grd Rents in "
+ "Alphabetical Order.xlsx",
+ header=1
+)
+asset_list.columns = ["tennancy", "landlord_id", "landlord_address"]
+# Get postcode as the last 2 parts of the address, split on space (assumes every address ends with a full postcode)
+asset_list["postcode"] = asset_list["landlord_address"].apply(lambda x: x.split(" ")[-2] + " " + x.split(" ")[-1])
+
+asset_list["house_no"] = asset_list.apply(
+ lambda x: SearchEpc.get_house_number(address=x["landlord_address"], postcode=x["postcode"]), axis=1
+)
+
+epc_data, errors, no_epc = get_data(  # project helper; the meaning of the three results is not visible here
+ df=asset_list,
+ manual_uprn_map={},
+ epc_auth_token=EPC_AUTH_TOKEN,
+ uprn_column=None,
+ fulladdress_column="landlord_address",
+ address1_column="house_no",
+ postcode_column="postcode",
+ property_type_column=None,
+ built_form_column=None,
+ epc_api_only=True,
+ row_id_name="landlord_id",
+)
+
+df = asset_list[asset_list["landlord_id"].isin(no_epc)]  # presumably the assets with no EPC found - confirm in get_data
+epc_df = pd.DataFrame(epc_data)
+epc_df["current-energy-rating"].value_counts()  # result is not stored
+epc_df["property-type"].value_counts()  # result is not stored
+epc_df["walls-description"].value_counts(normalize=True)  # result is not stored
+
+asset_list = asset_list.merge(  # left join: attach a subset of the EPC fields to every asset row
+ epc_df[
+ [
+ "landlord_id", "current-energy-rating", "property-type", "total-floor-area", "roof-description",
+ "walls-description", "co2-emissions-current"
+ ]
+ ],
+ how="left",
+ left_on="landlord_id",
+ right_on="landlord_id"
+)
+asset_list.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list.csv", index=False
+)
+
+asset_list_big = asset_list.merge(  # NOTE(review): asset_list already holds the EPC subset, so overlaps get _x/_y
+ epc_df,
+ how="left",
+ left_on="landlord_id",
+ right_on="landlord_id"
+)
+asset_list_big.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Benyon Estate/asset_list_full_data.csv",
+ index=False
+)
diff --git a/etl/customers/bromford/data_cleanup.py b/etl/customers/bromford/data_cleanup.py
new file mode 100644
index 00000000..45429523
--- /dev/null
+++ b/etl/customers/bromford/data_cleanup.py
@@ -0,0 +1,192 @@
+"""
+12th April 2025
+This script attempts to clean up the various pieces of data we have for Bromford, with the intention of producing a
+standardised asset list
+"""
+
+import pandas as pd
+
+# Step 1
+# The inspections data is spread across three different files. We attempt to produce one finalised asset list, with
+# comprehensive inspections
+
+# Primary asset list
+asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford Asset "
+ "List.xlsx",
+ sheet_name="Asset List"
+)
+
+# Inspections file 1 of 3: the "Data list" sheet of the MDS workbook
+inspections_1 = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
+ "MDS.xlsx",
+ sheet_name="Data list"
+)
+# Fold the fuel into the heating type so that "Heating Type" is a single "<type> <fuel>" string
+inspections_1["Heating Type"] = (inspections_1["Heating Type"] + " " + inspections_1["Heating fuel"]).str.strip()
+
+# Inspections file 2 of 3: the "Report" sheet of the Merlin Lane workbook
+inspections_2 = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
+ "MERLIN LANE.xlsx",
+ sheet_name="Report"
+)
+# PropertyType is split on spaces: the last word becomes AssetTypeDesc and everything before it becomes PropTypeDesc
+inspections_2["AssetTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[-1]
+inspections_2["PropTypeDesc"] = inspections_2["PropertyType"].str.split(" ").str[:-1].str.join(" ")
+
+# Inspections file 3 of 3: the "Asset report" sheet of the Severn Vale - Klarke workbook
+inspections_3 = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Inspections/BROMFORD "
+ "SEVERN VALE - KLARKE.xlsx",
+ sheet_name="Asset report"
+)
+
+# This file has no single address column, so build one from the two address parts
+inspections_3["FullAddress"] = inspections_3["T1_Address1"] + ", " + inspections_3["T1_Address2"]
+
+# On inspections 3, the heating is described by one sheet per heating type, each listing the UPRNs with that system.
+# The workbook is read once, parsing only the sheets we need, rather than re-reading the whole file for every sheet.
+heating_sheet_names = [
+    "Storage Heaters", "No Heating", "Underfloor Heating", "Rointe Electric Heating", "Air Source Heating",
+    "Gas Central Heating", "Electric Boiler", "Oil Fired Central Heating",
+    "Communal Boilers", "Panel Heaters"
+]
+# Passing a list of sheet names returns a dict of {sheet name: DataFrame}
+heating_sheets = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme "
+    "Rebuild/Inspections/BROMFORD "
+    "SEVERN VALE - KLARKE.xlsx",
+    sheet_name=heating_sheet_names
+)
+
+# Index the dict by name (rather than iterating it) so that the order of the list above decides which heating type
+# is kept when a UPRN is de-duplicated below. assign() returns a new frame instead of writing into a slice.
+heating_systems = pd.concat(
+    [heating_sheets[name][["UPRN"]].assign(**{"Heating Type": name}) for name in heating_sheet_names]
+)
+# We have no clue which one is correct, we have some dupes - the first sheet a UPRN appears on wins
+heating_systems = heating_systems.drop_duplicates("UPRN")
+heating_systems = heating_systems.rename(columns={"UPRN": "Asset"})
+heating_systems["Asset"] = heating_systems["Asset"].astype(int)
+
+inspections_3 = inspections_3.merge(heating_systems, how="left", on="Asset")
+
+# Create a consolidated inspections sheet. The three files do not share every column (file 2 has no heating type and
+# file 3 has no construction type), so the missing columns are left empty for those rows
+inspections = pd.concat(
+ [
+ inspections_1[["Asset", "Construction type", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
+ inspections_2[["Asset", "Construction type", "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
+ inspections_3[["Asset", 'Heating Type', "WFT Findings", "Eligibility (Red/Yellow/Green)"]],
+ ]
+)
+
+# Address/descriptive columns from the same three files, renamed onto the column names used by file 1
+inspections_address_data = pd.concat(
+ [
+ inspections_1[
+ ["Asset", "FullAddress", "PostCode", "ConYear", "Beds", "AssetTypeDesc", "PropTypeDesc", 'ManAreaDesc', ]
+ ],
+ inspections_2[
+ ['Asset', 'FullAddress', 'AccomType', "AssetTypeDesc", "PropTypeDesc", 'ConYear', 'Postcode']
+ ].rename(columns={"Postcode": "PostCode"}),
+ inspections_3[
+ ['Asset', "FullAddress", 'T1_Postcode', 'T1_Build Year', 'T1_AssetType']
+ ].rename(
+ columns={"T1_Postcode": "PostCode", "T1_Build Year": "ConYear", "T1_AssetType": "AssetTypeDesc"}
+ ),
+ ]
+)
+
+# Remove some error values - rows where the Asset (ID) column holds free text rather than an ID
+inspections = inspections[~inspections["Asset"].isin(
+ [
+ "They're all green partial fill they're all green this",
+ "South Staffordshire District Council",
+ 'Blk Milton Crt F9-10, Perton, Wolverhampton'
+ ]
+)]
+
+# Compare IDs as strings in all three frames so that the isin/merge below match consistently
+inspections["Asset"] = inspections["Asset"].astype(str)
+asset_list["Asset"] = asset_list["Asset"].astype(str)
+inspections_address_data["Asset"] = inspections_address_data["Asset"].astype(str)
+# Treat empty / whitespace-only findings as missing
+inspections['WFT Findings'] = inspections['WFT Findings'].replace(r'^\s*$', pd.NA, regex=True)
+
+# We have some cases where the inspections data has dupes on Asset (the ID column). We take the instance that is
+# populated: rows with missing findings are sorted last, so keeping the first row per Asset prefers a populated one
+inspections = inspections.sort_values(by='WFT Findings', na_position='last')
+inspections = inspections.drop_duplicates(subset='Asset', keep='first')
+
+# We have dupes in the asset list
+asset_list = asset_list.drop_duplicates("Asset")
+
+# Find the inspected assets that are not in the primary asset list
+missed_asset_ids = inspections[
+ ~inspections["Asset"].isin(asset_list["Asset"].values)
+]["Asset"].values
+
+# Pull the address data for those assets, one row per asset
+missed_assets = inspections_address_data[
+ inspections_address_data["Asset"].isin(missed_asset_ids)
+]
+missed_assets = missed_assets.drop_duplicates("Asset")
+
+# We produce a larger asset list
+asset_list = pd.concat([asset_list, missed_assets])
+
+# Attach the inspection results; assets without an inspection row get a placeholder note
+asset_list = asset_list.merge(
+ inspections, how="left", on="Asset"
+)
+asset_list["WFT Findings"] = asset_list["WFT Findings"].fillna("No Inspections Note")
+
+# Store
+# asset_list.to_excel(
+# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
+# "data/asset_list.xlsx"
+# )
+
+# We now prepare outcomes into a single file
+pv_outcomes = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Bromford PV "
+ "Outcomes.csv",
+ encoding='cp1252'
+)
+pv_outcomes["measure_type"] = "solar"
+
+other_outcomes = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/(Bromford) "
+ "15.04.2024.xlsx",
+ sheet_name="ECO4 & GBIS",
+ header=1
+)
+other_outcomes["measure_type"] = "cwi"
+
+# Stack the two outcome sources using the PV file's column names.
+# NOTE(review): "measure_type" is set on both frames above but is not in either column selection, so it is dropped
+# from combined_outcomes - confirm whether that is intended
+combined_outcomes = pd.concat(
+ [
+ other_outcomes[["NO", "ADDRESS", "POSTCODE", "WEEK COMMENCING", "OUTCOMES", "NOTES"]].rename(
+ columns={
+ "NO": "No", "ADDRESS": "Address", "POSTCODE": "Postcode", "WEEK COMMENCING": "Week Commencing",
+ "OUTCOMES": "Outcome", "NOTES": "Notes"
+ }
+ ),
+ pv_outcomes[['No', 'Address', 'Postcode', "Week Commencing", "Outcome", "Notes"]]
+ ]
+)
+
+# Store
+# combined_outcomes.to_excel(
+# "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/Prepared "
+# "data/outcomes.xlsx"
+# )
+
+# ECO 3 submissions sheet
+eco3_submissions = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 Submissions.csv",
+ encoding='cp1252'
+)
+# Get rid of the unnamed columns
+unnamed_columns = [c for c in eco3_submissions.columns if "Unnamed: " in c]
+eco3_submissions = eco3_submissions.drop(columns=unnamed_columns)
+# Store the cleaned sheet.
+# NOTE(review): this path differs from the one read above only in the case of "submissions", so on a
+# case-insensitive filesystem this overwrites the input file - confirm that is intended
+eco3_submissions.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 3 submissions.csv",
+ index=False
+)
+
+# ECO 4 submissions sheet
+eco4_submissions = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/Apr 2025 Programme Rebuild/ECO 4 submissions.csv",
+)
+
+# Columns present in both submissions sheets (not used further in this script)
+same_cols = [c for c in eco4_submissions.columns if c in eco3_submissions.columns]
diff --git a/etl/customers/cambridge/remote_assessment.py b/etl/customers/cambridge/remote_assessment.py
new file mode 100644
index 00000000..dc5beff5
--- /dev/null
+++ b/etl/customers/cambridge/remote_assessment.py
@@ -0,0 +1,138 @@
+import os
+import time
+
+from tqdm import tqdm
+import pandas as pd
+from dotenv import load_dotenv
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from backend.SearchEpc import SearchEpc
+from utils.s3 import save_csv_to_s3
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+USER_ID = 8
+PORTFOLIO_ID = 122
+
+
+def app():
+    """
+    Prepares the inputs for the Cambridge remote assessments: looks up the EPC for each home, pulls the matching
+    Find My EPC record, stores the asset list, non-invasive recommendations and valuations in s3, and prints the
+    request body that references those files
+    :return:
+    """
+    bucket = "retrofit-plan-inputs-dev"
+    s3_prefix = f"{USER_ID}/{PORTFOLIO_ID}"
+
+    # Only the first home has a property type and built form supplied
+    asset_list = pd.DataFrame(
+        [
+            {
+                "address": "12 Church Lane", "postcode": "CB23 8AF", "uprn": 100090136018,
+                "property_type": "House", "built-form": "Semi-Detached"
+            },
+            {"address": "21 High Street", "postcode": "CB23 8AB", "uprn": 100090144815},
+            {"address": "22 High Street", "postcode": "CB23 8AB", "uprn": 100090144816},
+            {"address": "5 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078615},
+            {"address": "6 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078616},
+            {"address": "7 Bunkers Hill", "postcode": "CB3 0LY", "uprn": 10008078617},
+            {"address": "32 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200075},
+            {"address": "33 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200076},
+            {"address": "35 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200078},
+            {"address": "36 George Nuttall Close", "postcode": "CB4 1YE", "uprn": 200004200079},
+        ]
+    )
+
+    # Valuation per uprn, expanded into one {"uprn", "valuation"} record per home
+    valuation_by_uprn = {
+        100090136018: 586_000,
+        100090144815: 446_000,
+        100090144816: 448_000,
+        10008078615: 763_000,
+        10008078616: 616_000,
+        10008078617: 593_000,
+        200004200075: 450_000,
+        200004200076: 457_000,
+        200004200078: 304_000,
+        200004200079: 313_000,
+    }
+    valuations_data = [{"uprn": uprn, "valuation": value} for uprn, value in valuation_by_uprn.items()]
+
+    # Pull the additional data
+    extracted_data = []
+    for _, row in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        # Retrieve the EPC data
+        epc_searcher = SearchEpc(
+            address1=row["address"],
+            postcode=row["postcode"],
+            uprn=row["uprn"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+        )
+        epc_searcher.find_property(skip_os=True)
+        if epc_searcher.newest_epc is None:
+            # No EPC matched, so this home is left out of the recommendations
+            continue
+
+        # Find My EPC is searched with the address as it appears on the matched EPC
+        find_epc_searcher = RetrieveFindMyEpc(
+            address=epc_searcher.newest_epc["address1"],
+            postcode=epc_searcher.newest_epc["postcode"],
+        )
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+        time.sleep(0.5)
+
+        # Tag the record with the uprn from our asset list
+        extracted_data.append({"uprn": row["uprn"], **find_epc_data})
+
+    non_invasive_recommendations = []
+    for record in extracted_data:
+        non_invasive_recommendations.append(
+            {"uprn": record["uprn"], "recommendations": record["recommendations"]}
+        )
+
+    filename = f"{s3_prefix}/asset_list.csv"
+    non_invasive_recommendations_filename = f"{s3_prefix}/non_invasive_recommendations.csv"
+    valuations_filename = f"{s3_prefix}/valuations.csv"
+
+    # Store the asset list, the non-invasive recommendations and the valuations data in s3, in that order
+    save_csv_to_s3(dataframe=pd.DataFrame(asset_list), bucket_name=bucket, file_name=filename)
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(non_invasive_recommendations),
+        bucket_name=bucket,
+        file_name=non_invasive_recommendations_filename,
+    )
+    save_csv_to_s3(dataframe=pd.DataFrame(valuations_data), bucket_name=bucket, file_name=valuations_filename)
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Private",
+        "goal": "Increasing EPC",
+        "goal_value": "B",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": valuations_filename,
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": [],
+    }
+    print(body)
diff --git a/etl/customers/connells/pilot_remote_assessments.py b/etl/customers/connells/pilot_remote_assessments.py
new file mode 100644
index 00000000..799bd805
--- /dev/null
+++ b/etl/customers/connells/pilot_remote_assessments.py
@@ -0,0 +1,108 @@
+import os
+import time
+
+from tqdm import tqdm
+import pandas as pd
+from dotenv import load_dotenv
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from backend.SearchEpc import SearchEpc
+from utils.s3 import save_csv_to_s3
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+USER_ID = 8
+PORTFOLIO_ID = 123
+
+
+def app():
+    """
+    Prepares the inputs for the Connells pilot remote assessments: looks up the EPC for each home, pulls the
+    matching Find My EPC record, stores the asset list, non-invasive recommendations and valuations in s3, and
+    prints the request body that references those files
+    :return:
+    """
+    bucket = "retrofit-plan-inputs-dev"
+    s3_prefix = f"{USER_ID}/{PORTFOLIO_ID}"
+
+    # (address, postcode, uprn, valuation) for each pilot home
+    pilot_homes = [
+        ("1 Raven Crescent", "WV11 2EX", 100071188496, 175_000),
+        ("13 Bayliss Avenue", "WV11 2EX", 100071136271, 183_000),
+        ("30 Southbourne Road", "WV10 6ET", 100071194376, 221_000),
+        ("96 Marsh Lane", "WV10 6RX", 100071176297, 208_000),
+    ]
+    asset_list = pd.DataFrame(
+        [{"address": address, "postcode": postcode, "uprn": uprn} for address, postcode, uprn, _ in pilot_homes]
+    )
+    valuations_data = [{"uprn": uprn, "valuation": valuation} for _, _, uprn, valuation in pilot_homes]
+
+    # Pull the additional data
+    extracted_data = []
+    for _, row in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        # Retrieve the EPC data
+        epc_searcher = SearchEpc(
+            address1=row["address"],
+            postcode=row["postcode"],
+            uprn=row["uprn"],
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+        )
+        epc_searcher.find_property(skip_os=True)
+        if epc_searcher.newest_epc is None:
+            # No EPC matched, so this home is left out of the recommendations
+            continue
+
+        # Find My EPC is searched with the address as it appears on the matched EPC
+        find_epc_searcher = RetrieveFindMyEpc(
+            address=epc_searcher.newest_epc["address1"],
+            postcode=epc_searcher.newest_epc["postcode"],
+        )
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+        time.sleep(0.5)
+
+        # Tag the record with the uprn from our asset list
+        extracted_data.append({"uprn": row["uprn"], **find_epc_data})
+
+    non_invasive_recommendations = []
+    for record in extracted_data:
+        non_invasive_recommendations.append(
+            {"uprn": record["uprn"], "recommendations": record["recommendations"]}
+        )
+
+    filename = f"{s3_prefix}/asset_list.csv"
+    non_invasive_recommendations_filename = f"{s3_prefix}/non_invasive_recommendations.csv"
+    valuations_filename = f"{s3_prefix}/valuations.csv"
+
+    # Store the asset list, the non-invasive recommendations and the valuations data in s3, in that order
+    save_csv_to_s3(dataframe=pd.DataFrame(asset_list), bucket_name=bucket, file_name=filename)
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(non_invasive_recommendations),
+        bucket_name=bucket,
+        file_name=non_invasive_recommendations_filename,
+    )
+    save_csv_to_s3(dataframe=pd.DataFrame(valuations_data), bucket_name=bucket, file_name=valuations_filename)
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Private",
+        "goal": "Increasing EPC",
+        "goal_value": "B",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": valuations_filename,
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": [],
+    }
+    print(body)
diff --git a/etl/customers/cottons/parse_pdf_asset_list.py b/etl/customers/cottons/parse_pdf_asset_list.py
new file mode 100644
index 00000000..7d442e97
--- /dev/null
+++ b/etl/customers/cottons/parse_pdf_asset_list.py
@@ -0,0 +1,64 @@
+import re
+import pandas as pd
+from PyPDF2 import PdfReader
+
+# Paths to the uploaded files: six PDFs named "... (dragged).pdf", "... (dragged) 2.pdf", up to "... (dragged) 6.pdf"
+file_paths = [
+    f"/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged){suffix}.pdf"
+    for suffix in ["", " 2", " 3", " 4", " 5", " 6"]
+]
+
+
+# Function to extract text from PDFs
+def extract_text_from_pdf_with_pypdf2(file_path):
+    """
+    Extracts the text of every page of a PDF and returns it as one string, pages concatenated in order with no
+    separator between them
+    :param file_path: Path to the PDF file to read
+    :return: The concatenated text of all pages
+    """
+    reader = PdfReader(file_path)
+    # join() avoids rebuilding the string for every page; "or ''" keeps a page that yields no text (None) from
+    # raising a TypeError
+    return "".join(page.extract_text() or "" for page in reader.pages)
+
+
+# Patterns for the report title, the repeated column header, a candidate row and the fields within a row
+title_pattern = re.compile(r"Managed Property Report as at \d+ \w+ \d+")
+header_pattern = re.compile(r"Code Property Address Management Type")
+row_pattern = re.compile(r".*?Managed")
+row_fields_pattern = re.compile(r"(\S+)\s+(.+?)\s+Managed")
+
+# Rows parsed from every PDF, as (code, address, management type) tuples
+all_parsed_data = []
+
+# Process each PDF individually
+for file_number, pdf_path in enumerate(file_paths, start=1):
+    raw_text = extract_text_from_pdf_with_pypdf2(pdf_path)
+
+    # Step 1: Strip the report title and the repeated column headers
+    body_text = header_pattern.sub("", title_pattern.sub("", raw_text))
+
+    # Step 2: A candidate row is the text on a line up to and including the word "Managed"
+    candidate_rows = row_pattern.findall(body_text)
+
+    # Step 3: Keep the candidates that split into a code followed by an address
+    field_matches = (row_fields_pattern.match(candidate.strip()) for candidate in candidate_rows)
+    parsed_rows = [
+        (found.group(1).strip(), found.group(2).strip(), "Managed")
+        for found in field_matches
+        if found
+    ]
+
+    all_parsed_data.extend(parsed_rows)
+
+    # Provide feedback for debugging
+    print(f"File {file_number} processed: {len(parsed_rows)} rows")
+
+# Step 4: Combine the rows from every file into one DataFrame
+final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"])
+
+# Step 5: Write the combined DataFrame to an Excel file
+final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx"
+final_df.to_excel(final_output_file_path, index=False)
+
+# Provide feedback
+print(f"All files processed and combined. Total rows: {len(final_df)}")
+print(f"Unified file saved to: {final_output_file_path}")
diff --git a/etl/customers/cottons/prep_asset_list.py b/etl/customers/cottons/prep_asset_list.py
new file mode 100644
index 00000000..db7c6583
--- /dev/null
+++ b/etl/customers/cottons/prep_asset_list.py
@@ -0,0 +1,15 @@
+import pandas as pd
+
+# The workbook is read, extended with two columns and written back to the same path
+asset_list_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx"
+
+df = pd.read_excel(asset_list_path)
+
+# The address is comma separated: the first section is address1 and the last section is the postcode
+address_sections = df["Property Address"].apply(lambda x: x.split(","))
+df["address1"] = address_sections.apply(lambda sections: sections[0].strip())
+df["postcode"] = address_sections.apply(lambda sections: sections[-1].strip())
+
+# Re-save over the original file
+df.to_excel(asset_list_path, index=False)
diff --git a/etl/customers/cottons/remote_assessments.py b/etl/customers/cottons/remote_assessments.py
new file mode 100644
index 00000000..7855a1a9
--- /dev/null
+++ b/etl/customers/cottons/remote_assessments.py
@@ -0,0 +1,124 @@
+import os
+import time
+
+from tqdm import tqdm
+import pandas as pd
+from dotenv import load_dotenv
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from backend.SearchEpc import SearchEpc
+from utils.s3 import save_csv_to_s3
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+USER_ID = 8
+PORTFOLIO_ID = 121
+
+
+def app():
+    """
+    Prepares the inputs to produce the remote assessments for Cottons.
+
+    Reads the asset list, keeps the EPC D/E properties, looks up the EPC and Find My EPC record for each, stores
+    the asset list, non-invasive recommendations and valuations in s3, and prints the request body that references
+    those files. Homes for which no EPC can be matched are skipped and reported at the end.
+    :return:
+    """
+
+    # Read in the asset list
+    cottons_asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List EPC Data Pull with "
+        "valuations.xlsx"
+    )
+    # A number are missing EPCs due to the space in the postcode
+    # Breakdowns:
+    # C 119
+    # D 106
+    # E 26
+    # B 5
+    #
+    # Take the EPC D/E properties
+    asset_list = cottons_asset_list[
+        cottons_asset_list["EPC rating on register"].isin(["D", "E"])
+    ]
+    asset_list = asset_list.reset_index(drop=True)
+    asset_list["row_id"] = asset_list.index
+    asset_list["uprn"] = asset_list["uprn"].astype(int)
+
+    extracted_data = []
+    model_asset_list = []
+    skipped_uprns = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        # Retrieve the EPC data
+        epc_searcher = SearchEpc(
+            address1=home["address1"],
+            postcode=home["postcode"], uprn=home["uprn"], auth_token=EPC_AUTH_TOKEN, os_api_key=""
+        )
+        epc_searcher.find_property(skip_os=True)
+        newest_epc = epc_searcher.newest_epc
+        if newest_epc is None:
+            # Without a matched EPC there is no address to search Find My EPC with. Skip the home (as the other
+            # remote assessment scripts do) rather than failing on None["address1"], and record it for reporting
+            skipped_uprns.append(home["uprn"])
+            continue
+
+        find_epc_searcher = RetrieveFindMyEpc(address=newest_epc["address1"], postcode=newest_epc["postcode"])
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+        time.sleep(0.5)
+
+        # Tag the record with the uprn from our asset list
+        extracted_data.append(
+            {
+                "uprn": home["uprn"],
+                **find_epc_data,
+            }
+        )
+
+        # The asset list that gets modelled uses the address as it appears on the EPC
+        model_asset_list.append(
+            {
+                "uprn": home["uprn"],
+                "address": newest_epc["address1"],
+                "postcode": newest_epc["postcode"],
+            }
+        )
+
+    if skipped_uprns:
+        print(f"No EPC found for {len(skipped_uprns)} homes, skipped uprns: {skipped_uprns}")
+
+    non_invasive_recommendations = [
+        {
+            "uprn": r["uprn"],
+            "recommendations": r["recommendations"]
+        } for r in extracted_data
+    ]
+
+    # Only homes that actually have a valuation are stored
+    valuations_data = asset_list[["uprn", "Zoopla Valuation"]].copy().rename(columns={"Zoopla Valuation": "valuation"})
+    valuations_data = valuations_data[~pd.isnull(valuations_data["valuation"])]
+
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(model_asset_list),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    # Store the non-invasive recommendations in s3
+    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(non_invasive_recommendations),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=non_invasive_recommendations_filename
+    )
+
+    # Store the valuations data in s3
+    valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv"
+    save_csv_to_s3(
+        dataframe=valuations_data,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=valuations_filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increasing EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": valuations_filename,
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": ['air_source_heat_pump', 'boiler_upgrade', 'floor_insulation']
+    }
+    print(body)
diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py
new file mode 100644
index 00000000..fc89b6f2
--- /dev/null
+++ b/etl/customers/gla/hug_postcodes.py
@@ -0,0 +1,77 @@
+import inspect
+import pandas as pd
+from pathlib import Path
+from tqdm import tqdm
+from etl.epc.settings import EARLIEST_EPC_DATE
+from etl.spatial.OpenUprnClient import OpenUprnClient
+
+# Path of this source file (not used further in this script)
+src_file_path = inspect.getfile(lambda: None)
+
+# The bulk EPC download: one sub-directory per area, each holding a certificates.csv
+EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates")
+epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()]
+
+# One per-postal-region count table per directory that has qualifying homes
+aggregation = []
+for directory in tqdm(epc_directories):
+ data = pd.read_csv(directory / "certificates.csv", low_memory=False)
+ # Rename the columns to the same format as the api returns
+ data.columns = [c.replace("_", "-").lower() for c in data.columns]
+
+ # Keep only certificates whose post town mentions London
+ data = data[data["posttown"].str.contains("London", case=False, na=False)]
+ if data.empty:
+ continue
+ # Keep only EPCs lodged on or after the date threshold
+ data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE]
+
+ # Certificates without a uprn cannot be de-duplicated or matched to spatial data, so drop them
+ data = data[~pd.isnull(data["uprn"])]
+ data["uprn"] = data["uprn"].astype(int)
+ # Take just the newest EPC per uprn, based on lodgement-date
+ data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn")
+ # Take EPC D and below
+ data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])]
+ # The postal region is the first part of the postcode (the part before the space)
+ data["postal_region"] = data["postcode"].str.split(" ").str[0]
+
+ # Take homes whose main fuel is not mains gas (a missing main fuel counts as not mains gas)
+ off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)]
+
+ if off_gas.empty:
+ continue
+
+ # Remove properties with conservation area issues
+ uprns = off_gas["uprn"].unique()
+ # Get data
+ ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
+ off_gas = off_gas.merge(
+ ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename(
+ columns={"UPRN": "uprn"}
+ ),
+ how="left",
+ on="uprn",
+ )
+ # Remove any restricted units. "!= True" (rather than negation) also keeps homes with no spatial data, whose
+ # flags are missing after the left merge
+ off_gas = off_gas[
+ (off_gas["conservation_status"] != True)
+ & (off_gas["is_listed_building"] != True)
+ & (off_gas["is_heritage_building"] != True)
+ ]
+
+ # Keep private rented and owner-occupied homes only (both spellings of each tenure appear in the data)
+ off_gas = off_gas[
+ off_gas["tenure"].isin(["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"])
+ ]
+
+ # Count the remaining homes per postal region
+ region_summary = off_gas.groupby("postal_region").size().reset_index(name="count")
+
+ aggregation.append(region_summary)
+
+postal_region_aggregation = pd.concat(aggregation)
+# Re-aggregate, as the same postal region can appear in more than one directory
+postal_region_aggregation = postal_region_aggregation.groupby("postal_region")["count"].sum().reset_index()
+
+# Largest regions first, with presentation-friendly column names
+postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False)
+postal_region_aggregation = postal_region_aggregation.rename(
+ columns={"postal_region": "Postcode Region", "count": "Number of Homes"}
+)
+postal_region_aggregation.to_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions - without conservation "
+ "area.xlsx",
+ index=False
+)
diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py
new file mode 100644
index 00000000..0bf6eb18
--- /dev/null
+++ b/etl/customers/ksquared/Wave3 Modelling.py
@@ -0,0 +1,425 @@
+import os
+import time
+import re
+
+from etl.epc.settings import EARLIEST_EPC_DATE
+from dotenv import load_dotenv
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from backend.SearchEpc import SearchEpc
+from utils.s3 import save_csv_to_s3
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+USER_ID = 8
+PORTFOLIO_ID = 117
+CAHA_PORTFOLIO_ID = 118
+
+
+def hornsey():
+    """
+    This script prepares the asset list for Hornsey Housing Trust, one of the additional housing associations
+    (alongside CAHA) that are forming a consortium led by AIHA.
+
+    It reads the Hornsey units sheet, pulls the Find My EPC record and the newest EPC for each unit, stores the
+    asset list and the non-invasive recommendations in s3, and prints the request body that references those files.
+    :return:
+    :raises Exception: if the newest EPC found for a unit does not agree with the EPC band recorded in the sheet
+    """
+
+    hornsey_asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/SHDF - Template - EOI - Hornsey Housing "
+        "Trust.xlsx",
+        sheet_name="Ksquared-All units information",
+        header=3
+    )
+
+    # We don't need the first row
+    hornsey_asset_list = hornsey_asset_list.iloc[1:]
+    # Fill NA values with empty strings
+    hornsey_asset_list = hornsey_asset_list.fillna("")
+    # Trim the address fields and collapse any run of spaces into a single space
+    for col in ["Address letter or number", "Street address", "Postcode"]:
+        hornsey_asset_list[col] = (
+            hornsey_asset_list[col].astype(str).str.strip().str.replace(r" {2,}", " ", regex=True)
+        )
+
+    # .copy() so that the columns added below are written to this frame rather than to a slice of the unfiltered one
+    hornsey_asset_list = hornsey_asset_list[hornsey_asset_list["Address letter or number"] != ""].copy()
+
+    # Test each row's wall type. ('"Cavity" in <Series>' checks the index labels, not the values, which labelled
+    # every row "Solid")
+    hornsey_asset_list["Wall Type Cleaned"] = np.where(
+        hornsey_asset_list["Wall type"].astype(str).str.contains("Cavity", regex=False),
+        "Cavity",
+        "Solid"
+    )
+
+    # UPRNs for the units that have no EPC, keyed on "Address letter or number"
+    missed_uprns = {
+        "Flat 13A Stowell House": 100021213098,
+        "Flat 24 Stowell House": 100021213110,
+        "Flat 1 36 Haringey Park": None
+    }
+    extracted_data = []
+    asset_list = []
+    hornsey_asset_list["row_id"] = hornsey_asset_list.index
+    for _, home in tqdm(hornsey_asset_list.iterrows(), total=len(hornsey_asset_list)):
+
+        if home["Address letter or number"] == "Flat 1 36 Haringey Park":
+            continue
+
+        # Some properties do not have an epc. They still go in the asset list, using the manually found uprn, but
+        # have no recommendations. A unit missing from missed_uprns fails here so that it gets added to the map
+        if not home["Energy starting band (EPC)"]:
+            asset_list.append(
+                {
+                    "uprn": missed_uprns[home["Address letter or number"]],
+                    "row_id": home["row_id"],
+                    "address": home["Address letter or number"],
+                    "postcode": home["Postcode"],
+                    "property_type": "Flat",  # They're all flats
+                }
+            )
+            continue
+
+        unit_number = home["Address letter or number"]
+        street = home["Street address"]
+        postcode = home["Postcode"]
+        address = ", ".join([x for x in [unit_number, street] if x])
+        find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+        time.sleep(0.5)
+        # We need uprn, which comes from the EPC record
+        searcher = SearchEpc(
+            address1=address,
+            postcode=postcode,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            full_address=address,
+        )
+        searcher.find_property(skip_os=True)
+        newest_epc = searcher.newest_epc
+        # Sanity check that we found the right property: the part after the "-" in the sheet's EPC band must match
+        # the EPC's current-energy-efficiency
+        if newest_epc["current-energy-efficiency"] != home["Energy starting band (EPC)"].split("-")[1]:
+            raise Exception("Something went wrong with the EPC data")
+
+        extracted_data.append(
+            {
+                "uprn": newest_epc["uprn"],
+                **find_epc_data,
+                "hotwater-description": newest_epc["hotwater-description"],
+            }
+        )
+
+        asset_list.append(
+            {
+                "uprn": newest_epc["uprn"],
+                "row_id": home["row_id"],
+                "address": home["Address letter or number"],
+                "postcode": home["Postcode"],
+                "property_type": "Flat",  # They're all flats
+            }
+        )
+
+    # Get conservation area data
+    # uprns = [x["uprn"] for x in extracted_data]
+    # conservation_area_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
+    #
+    # addresses = pd.DataFrame(asset_list)
+    # addresses["uprn"] = addresses["uprn"].astype(int)
+    # conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN")
+    # conservation_area_df.to_csv(
+    # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/hornsey_conservation_area_data.csv"
+    # )
+
+    # We format the extracted data so that it has the same structure as non-intrusive recommendations.
+    # Each record is filtered using its own hot water description. (Looking the record up with
+    # 'r["uprn"] == r["uprn"]' was always true, so every home was filtered using the first home's description.)
+    non_invasive_recommendations = []
+    for extracted in extracted_data:
+        no_cylinder = extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat"
+        kept_recommendations = []
+        for rec in extracted["recommendations"]:
+            # Hot water tank measures are dropped for homes recorded with this hot water description
+            if no_cylinder and rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]:
+                continue
+            rec["survey"] = False
+            kept_recommendations.append(rec)
+        non_invasive_recommendations.append(
+            {
+                "uprn": extracted["uprn"],
+                "recommendations": kept_recommendations
+            }
+        )
+
+    # Store the asset list in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(asset_list),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    # Store the non-invasive recommendations in s3
+    non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(non_invasive_recommendations),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=non_invasive_recommendations_filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increasing EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": "",
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": ["boiler_upgrade"]
+    }
+    print(body)
+
+
+def caha():
+    """
+    Prepare the modelling inputs for the CAHA properties in the AIHA Wave 3 bid workbook.
+
+    Derives each postcode from the street address, looks every home up with RetrieveFindMyEpc and SearchEpc,
+    writes a conservation-area CSV locally, uploads the asset lists and non-invasive recommendations to S3 and
+    prints the request bodies that reference the uploaded files.
+    """
+    caha_asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/Copy of AIHA - WHSHF Wave 3 bid - Consortium "
+        "member properties - CAHA.xlsx",
+        sheet_name="Ksquared-All units information",
+        header=3
+    )
+
+    caha_asset_list = caha_asset_list.iloc[1:]
+    # Fill NA values with empty strings
+    caha_asset_list = caha_asset_list.fillna("")
+    caha_asset_list["Address letter or number"] = caha_asset_list["Address letter or number"].astype(str).str.strip()
+
+    # The Postcode column wasn't populated, so rebuild it from the last two space-separated tokens of the address
+    caha_asset_list["Street address"] = caha_asset_list["Street address"].str.strip()
+    caha_asset_list["Postcode"] = caha_asset_list["Street address"].str.split(" ").str[-2:].str.join(" ")
+    # Take just the columns we need
+    caha_asset_list = caha_asset_list[["Address letter or number", "Street address", "Postcode"]]
+
+    # Normalise non-breaking spaces. NOTE(review): assumes the original first argument was U+00A0 - confirm
+    for col in ["Address letter or number", "Street address", "Postcode"]:
+        caha_asset_list[col] = caha_asset_list[col].str.replace("\u00a0", " ")
+
+    # Hand-written address overrides, applied before the generic "Flat X, N Street" rewrite below
+    remap = {
+        "Flat A, 50 Talbot Road N6 4QP": "50a Talbot Road",
+        "Flat A, 51 First Avenue EN1 1BN": "51a, First Avenue",
+        "Flat B, 51 First Avenue EN1 1BN": "51b, First Avenue"
+    }
+
+    def remap_address(address):
+        """Rewrite 'Flat A, 30 Grove Park Road' as '30A Grove Park Road'; return other addresses unchanged."""
+        match = re.match(r'Flat (\w), (\d+) (.+)', address)
+        if match:
+            flat_letter, number, rest_of_address = match.groups()
+            return f"{number}{flat_letter} {rest_of_address}"
+        return address
+
+    caha_asset_list["row_id"] = caha_asset_list.index
+
+    # (street address, unit) pairs that are deliberately left out of the lookup
+    skipped_homes = {
+        ("35 Stanford road N11 3HY", ""),
+        ("29 Victoria Avenue N3 1BD", ""),
+        ("11 Victoria Avenue N3 1BD", "Flat A"),
+        ("11 Victoria Avenue N3 1BD", "Flat C"),
+        ("10 Forest Gardens N17 6XA", "Flat C"),
+        ("219 Cann Hall Road E11 3NJ", "Flat B"),
+    }
+
+    extracted_data = []
+    asset_list = []
+    for _, home in tqdm(caha_asset_list.iterrows(), total=len(caha_asset_list)):
+        if (home["Street address"], home["Address letter or number"]) in skipped_homes:
+            continue
+
+        unit_number = home["Address letter or number"]
+        street = home["Street address"]
+        postcode = home["Postcode"]
+        address = ", ".join([x for x in [unit_number, street] if x])
+        address = remap.get(address, address)
+        address = address.replace(postcode, "").strip()
+        if "Victoria Avenue" not in address:
+            address = remap_address(address)
+
+        find_epc_searcher = RetrieveFindMyEpc(address=address, postcode=postcode)
+        find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data(sap_2012_date=EARLIEST_EPC_DATE)
+        time.sleep(0.5)
+        # We need uprn
+        searcher = SearchEpc(
+            address1=address,
+            postcode=postcode,
+            auth_token=EPC_AUTH_TOKEN,
+            os_api_key="",
+            full_address=address,
+        )
+        searcher.find_property(skip_os=True)
+        newest_epc = searcher.newest_epc
+        if newest_epc is None:
+            # No EPC found: skip the home; its row id is then reported in `missed` below
+            continue
+
+        uprn = newest_epc["uprn"]
+        if address in ["Flat D, 11 Victoria Avenue", "Flat B, 11 Victoria Avenue"]:
+            uprn = None
+
+        extracted_data.append(
+            {
+                "uprn": uprn,
+                **find_epc_data,
+            }
+        )
+
+        asset_list.append(
+            {
+                "row_id": home["row_id"],
+                "uprn": uprn,
+                "address": address,
+                "postcode": home["Postcode"],
+                "property_type": newest_epc["property-type"],
+                "wall_type": newest_epc["walls-description"],
+                "built_form": newest_epc["built-form"],
+                "flat_storey_count": newest_epc['flat-storey-count'],
+            }
+        )
+
+    # Missing row ids
+    processed_row_ids = {x["row_id"] for x in asset_list}
+    missed = [r for r in caha_asset_list["row_id"].tolist() if r not in processed_row_ids]
+
+    no_data = [x for x in asset_list if x["uprn"] in [None, ""]]
+    no_data = pd.DataFrame(no_data)
+
+    # Get conservation area data for every home with a UPRN (a single hard-coded id used to be sent instead)
+    uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]]
+    # NOTE(review): UPRNs are cast to int on the assumption that get_spatial_data expects integers - confirm
+    conservation_area_data = OpenUprnClient.get_spatial_data([int(u) for u in uprns], "retrofit-data-dev")
+
+    addresses = pd.DataFrame(asset_list)
+    addresses["uprn"] = addresses["uprn"].astype(str)
+    conservation_area_data["UPRN"] = conservation_area_data["UPRN"].astype(str)
+    conservation_area_df = conservation_area_data.merge(addresses, how="left", right_on="uprn", left_on="UPRN")
+    conservation_area_df.to_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/AIHA/caha_conservation_area_data.csv"
+    )
+
+    non_invasive_recommendations = [
+        {
+            "uprn": r["uprn"],
+            "recommendations": r["recommendations"]
+        } for r in extracted_data
+    ]
+    # for r in non_invasive_recommendations:
+    #     new_recommendations = []
+    #     extracted = [r for r in extracted_data if r["uprn"] == r["uprn"]][0]
+    #     for rec in r["recommendations"]:
+    #         if extracted["hotwater-description"] == "Gas boiler/circulator, no cylinder thermostat":
+    #             if rec["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]:
+    #                 continue
+    #         rec["survey"] = False
+    #         new_recommendations.append(rec)
+    #     r["recommendations"] = new_recommendations
+
+    # We model the two properties separately
+    asset_list = pd.DataFrame(asset_list)
+    # Drop Flat D, 11 Victoria Avenue
+    asset_list1 = asset_list[asset_list["address"] != "Flat D, 11 Victoria Avenue"]
+    asset_list2 = asset_list[asset_list["address"] == "Flat D, 11 Victoria Avenue"]
+
+    # Store the asset list in s3
+    filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list1.csv"
+    save_csv_to_s3(
+        dataframe=asset_list1,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    filename2 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list2.csv"
+    save_csv_to_s3(
+        dataframe=asset_list2,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename2
+    )
+
+    # Store the non-invasive recommendations in s3
+    non_invasive_recommendations_filename = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/non_invasive_recommendations.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(non_invasive_recommendations),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=non_invasive_recommendations_filename
+    )
+
+    body = {
+        "portfolio_id": str(CAHA_PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increasing EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": "",
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": ["boiler_upgrade"]
+    }
+    print(body)
+
+    body2 = {
+        "portfolio_id": str(CAHA_PORTFOLIO_ID),
+        "housing_type": "Social",
+        "goal": "Increasing EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename2,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+        "valuation_file_path": "",
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": ["boiler_upgrade"]
+    }
+    print(body2)
+
+    # 10b Forest Gardens is submitted on its own, against portfolio 119 and without non-invasive recommendations
+    asset_list3 = [
+        {
+            "address": "10b Forest Gardens", "postcode": "N17 6XA", "uprn": 100021180197
+        }
+    ]
+    filename3 = f"{USER_ID}/{CAHA_PORTFOLIO_ID}/asset_list3.csv"
+    save_csv_to_s3(
+        dataframe=pd.DataFrame(asset_list3),
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename3
+    )
+    body3 = {
+        "portfolio_id": str(119),
+        "housing_type": "Social",
+        "goal": "Increasing EPC",
+        "goal_value": "C",
+        "trigger_file_path": filename3,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": "",
+        "valuation_file_path": "",
+        "scenario_name": "Wave 3 Packages",
+        "multi_plan": True,
+        "budget": None,
+        "exclusions": ["boiler_upgrade"]
+    }
+    print(body3)
diff --git a/etl/customers/l_and_g/ic_asset_list.py b/etl/customers/l_and_g/ic_asset_list.py
new file mode 100644
index 00000000..d0966bdf
--- /dev/null
+++ b/etl/customers/l_and_g/ic_asset_list.py
@@ -0,0 +1,166 @@
+"""
+This script prepares the asset list for modelling the properties from the L&G dataset, for their January IC
+"""
+
+import pandas as pd
+import numpy as np
+
+from etl.route_march_data_pull.app import get_data
+from utils.s3 import save_csv_to_s3
+
+PORTFOLIO_ID = 124
+USER_ID = 8
+
+
+def app():
+ asset_data = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon information for Domna/Basildon MDS v1.4 "
+ "(1).xlsx",
+ sheet_name="Basildon",
+ header=5
+ )
+
+ asset_data = asset_data.head(-3)
+
+ asset_data["address1"] = np.where(
+ pd.isnull(asset_data["Address 1"]),
+ asset_data["Address 2"],
+ asset_data["Address 1"]
+ )
+
+ asset_data["full_address"] = np.where(
+ pd.isnull(asset_data["Address 1"]),
+ asset_data["Address 2"] + ", " + asset_data["Address 3"],
+ asset_data["Address 1"] + ", " + asset_data["Address 2"] + ", " + asset_data["Address 3"],
+ )
+
+ asset_list = asset_data[["address1", "PostCode", "full_address", "Bedrooms"]]
+
+ asset_list = asset_list.reset_index(drop=True)
+
+ asset_list["row_id"] = asset_list.index
+
+ # L&G's focus:
+ # Measures: loft and cavity insulation, replacement thermally efficient windows, PV cells, AS heat pumps.
+
+ epc_data, errors, no_epc = get_data(
+ asset_list=asset_list,
+ fulladdress_column="full_address",
+ address1_column="address1",
+ postcode_column="PostCode",
+ manual_uprn_map={}
+ )
+
+ missed = asset_list[
+ asset_list["row_id"].isin(no_epc)
+ ]
+
+ # We merge on the property types, where we have them
+ missed = missed.merge(
+ asset_data[["address1", "PostCode", "Property Type"]],
+ how="left",
+ on=["address1", "PostCode"]
+ )
+ # Remap Block: Residential to Flat
+ missed["Property Type"] = np.where(
+ missed["Property Type"] == "Block: Residential",
+ "Flat",
+ missed["Property Type"]
+ )
+
+    # We create the asset list - we have some properties that genuinely never had an EPC
+
+ epc_df = pd.DataFrame(epc_data)
+ fetched_asset_list = epc_df[["address1", "postcode", "uprn", "row_id"]]
+ fetched_asset_list = fetched_asset_list.merge(
+ asset_list[["row_id", "Bedrooms"]],
+ how="left",
+ on=["row_id"]
+ )
+
+ missed = missed.rename(columns={"PostCode": "postcode"}).drop(columns=["row_id"])
+
+ # missed.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/missed_epcs.csv")
+ missed_uprns = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/missed_epcs_uprn.csv",
+ )
+
+ missed = missed.merge(
+ missed_uprns[["address1", "postcode", "UPRN"]].rename(
+ columns={"UPRN": "uprn"},
+ ),
+ how="left",
+ on=["address1", "postcode"]
+ )
+
+ fetched_asset_list = fetched_asset_list.drop(columns=["row_id"])
+    # We concatenate them
+ final_asset_list = pd.concat(
+ [fetched_asset_list, missed[["address1", "postcode", "Property Type", "Bedrooms", "uprn"]]]
+ )
+
+ final_asset_list = final_asset_list.rename(
+ columns={
+ "address1": "address",
+ "Property Type": "property_type",
+ "Bedrooms": "n_bedrooms"
+ }
+ )
+
+    # Finally, we merge on the number of bedrooms
+
+ # Extract the non-invasive recommendations:
+ non_invasive_recommendations = []
+ for x in epc_data:
+ non_invasive_recommendations.append(
+ {
+ "uprn": x["uprn"],
+ "recommendations": x["find_my_epc_data"]["recommendations"]
+ }
+ )
+
+ filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(final_asset_list),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=filename
+ )
+
+ # Store the non-invasive recommendations in s3
+ non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(non_invasive_recommendations),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=non_invasive_recommendations_filename
+ )
+
+ # Store the valuations data in s3
+ # valuations_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuations.csv"
+ # save_csv_to_s3(
+ # dataframe=pd.DataFrame(valuations_data),
+ # bucket_name="retrofit-plan-inputs-dev",
+ # file_name=valuations_filename
+ # )
+
+ body = {
+ "portfolio_id": str(PORTFOLIO_ID),
+ "housing_type": "Private",
+ "goal": "Increasing EPC",
+ "goal_value": "A",
+ "trigger_file_path": filename,
+ "already_installed_file_path": "",
+ "patches_file_path": "",
+ "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+ "valuation_file_path": "",
+ "scenario_name": "Retrofit Packages",
+ "multi_plan": True,
+ "budget": None,
+ "inclusions": [
+ "cavity_wall_insulation",
+ "loft_insulation",
+ "windows",
+ "solar_pv",
+ "air_source_heat_pump"
+ ]
+ }
+ print(body)
diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py
new file mode 100644
index 00000000..a5cb3511
--- /dev/null
+++ b/etl/customers/l_and_g/ic_slides.py
@@ -0,0 +1,246 @@
+import pandas as pd
+from backend.app.utils import sap_to_epc
+
+data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/basildon_age_breakdowns/property_202501170837.csv"
+)
+
+data["year_built"].value_counts()
+
+# 1950-1966 26
+# 1967-1975 37
+# 1976-1982 37
+# 1983-1990 33
+# 1991-1995 139
+# 1996-2002 42
+# 2003-2006 50
+
+data["full_property_type"] = data["property_type"] + ": " + data["built_form"]
+
+houses = data[data["property_type"].isin(["House", "Bungalow"])]
+houses["built_form"].value_counts()
+
+data["property_type"].value_counts()
+data["full_property_type"].value_counts()
+# House: Mid-Terrace 136
+# House: End-Terrace 83
+# House: Semi-Detached 55
+# Flat: Semi-Detached 24
+# Flat: End-Terrace 19
+# House: Detached 10
+# Flat: Mid-Terrace 9
+# Maisonette: Mid-Terrace 9
+# Maisonette: Semi-Detached 8
+# Maisonette: End-Terrace 6
+# Flat: Detached 4
+# Bungalow: Detached 1
+
+epc_data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/basildon_age_breakdowns/basildon EPC Data.csv"
+)
+
+# Classify floor area in <73m2, 73-98, 99-200, 200+
+epc_data["floor_area_bracket"] = epc_data["total_floor_area"].apply(
+ lambda x: "<73" if x < 73 else "73-98" if x < 99 else "99-200" if x < 200 else "200+")
+
+# 73-98 185
+# <73 156
+# 99-200 23
+
+epc_data["wall_type"] = epc_data["walls"].str.split(",").str[0]
+epc_data["wall_type"].value_counts()
+
+# Cavity wall 343
+# Timber frame 15
+# System built 6
+
+# we pull some additional data
+# We want:
+# 1) The list of properties included in the portfolio, with uprn
+# 2) The recommendations against each property with costs, and whether or not the recommendation was defaulted
+# 3) The properties without recommendations and why
+
+from tqdm import tqdm
+import pandas as pd
+import numpy as np
+from sqlalchemy.orm import sessionmaker
+from backend.app.db.connection import db_engine
+from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations
+from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel
+
+
+def get_data(portfolio_id, scenario_ids):
+    """
+    Fetch a portfolio's properties plus the plans and default recommendations of the given scenarios.
+
+    :param portfolio_id: Value matched against PropertyModel.portfolio_id.
+    :param scenario_ids: Iterable of scenario ids matched against Plan.scenario_id.
+    :return: (properties_data, plans_data, recommendations_data), each a list of dicts.
+    """
+    session = sessionmaker(bind=db_engine)()
+    session.begin()
+    try:
+        # Properties and their EPC details for the portfolio
+        properties_query = session.query(PropertyModel, PropertyDetailsEpcModel).join(
+            PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
+        ).filter(
+            PropertyModel.portfolio_id == portfolio_id
+        ).all()
+
+        # Flatten both models into one dict per property; on a shared column name the EPC details value wins
+        properties_data = [
+            {
+                **{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
+                **{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in
+                   PropertyDetailsEpcModel.__table__.columns},
+            }
+            for prop in properties_query
+        ]
+
+        # Plans belonging to the requested scenarios
+        plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()
+        plans_data = [
+            {col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
+            for plan in plans_query
+        ]
+        plan_ids = [plan['id'] for plan in plans_data]
+
+        # Default recommendations attached to those plans, each paired with its plan's scenario_id
+        recommendations_query = session.query(
+            Recommendation,
+            Plan.scenario_id
+        ).join(
+            PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
+        ).join(
+            Plan, Plan.id == PlanRecommendations.plan_id
+        ).filter(
+            PlanRecommendations.plan_id.in_(plan_ids),
+            Recommendation.default == True  # noqa: E712 - SQL expression, not a Python comparison
+        ).all()
+
+        recommendations_data = [
+            {
+                **{col.name: getattr(rec.Recommendation, col.name) for col in Recommendation.__table__.columns},
+                "Scenario ID": rec.scenario_id,
+            }
+            for rec in recommendations_query
+        ]
+    finally:
+        # Always release the session, even when a query raises
+        session.close()
+
+    return properties_data, plans_data, recommendations_data
+
+
+properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205])
+
+properties_df = pd.DataFrame(properties_data)
+plans_df = pd.DataFrame(plans_data)
+recommendations_df = pd.DataFrame(recommendations_data)
+
+recommended_measures_df = recommendations_df[
+ ["property_id", "measure_type", "estimated_cost", "default"]
+]
+recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]]
+recommended_measures_df = recommended_measures_df.drop(columns=["default"])
+
+post_install_sap = recommendations_df[["property_id", "default", "sap_points"]]
+post_install_sap = post_install_sap[post_install_sap["default"]]
+# Sum up the sap points by property id
+post_install_sap = post_install_sap.groupby("property_id")[["sap_points"]].sum().reset_index()
+
+recommendations_measures_pivot = recommended_measures_df.pivot(
+ index='property_id',
+ columns='measure_type',
+ values='estimated_cost'
+)
+recommendations_measures_pivot = recommendations_measures_pivot.reset_index()
+
+recommendations_measures_pivot = recommendations_measures_pivot.rename(
+ columns={
+ "air_source_heat_pump": "Cost: Air Source Heat Pump",
+ "cavity_wall_insulation": "Cost: Cavity Wall Insulation",
+ "double_glazing": "Cost: Double Glazing",
+ "loft_insulation": "Cost: Loft Insulation",
+ "mechanical_ventilation": "Cost: Ventilation",
+ "solar_pv": "Cost: Solar PV"
+ }
+)
+recommendations_measures_pivot = recommendations_measures_pivot.fillna(0)
+recommendations_measures_pivot["Recommendation: Air Source Heat Pump"] = (
+ recommendations_measures_pivot["Cost: Air Source Heat Pump"] > 0
+)
+recommendations_measures_pivot["Recommendation: Cavity Wall Insulation"] = (
+ recommendations_measures_pivot["Cost: Cavity Wall Insulation"] > 0
+)
+recommendations_measures_pivot["Recommendation: Double Glazing"] = (
+ recommendations_measures_pivot["Cost: Double Glazing"] > 0
+)
+recommendations_measures_pivot["Recommendation: Loft Insulation"] = (
+ recommendations_measures_pivot["Cost: Loft Insulation"] > 0
+)
+recommendations_measures_pivot["Recommendation: Ventilation"] = (
+ recommendations_measures_pivot["Cost: Ventilation"] > 0
+)
+recommendations_measures_pivot["Recommendation: Solar PV"] = (
+ recommendations_measures_pivot["Cost: Solar PV"] > 0
+)
+
+df = properties_df[
+ [
+ "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows",
+ "current_epc_rating",
+ "current_sap_points", "total_floor_area", "number_of_rooms",
+ ]
+].merge(
+ recommendations_measures_pivot, how="left", on="property_id"
+).merge(
+ post_install_sap, how="left", on="property_id"
+)
+
+df = df.drop(columns=["property_id"])
+df["sap_points"] = df["sap_points"].fillna(0)
+
+df = df.rename(
+ columns={
+ "uprn": "UPRN",
+ "address": "Address",
+ "postcode": "Postcode",
+ "walls": "Walls",
+ "roof": "Roof",
+ "heating": "Heating",
+ "windows": "Windows",
+ "current_epc_rating": "Current EPC Rating",
+ "current_sap_points": "Current SAP Points",
+ "total_floor_area": "Total Floor Area",
+ "number_of_rooms": "Number of Habitable Rooms",
+ "floor_height": "Floor Height",
+ }
+)
+
+df["Has Recommendations"] = ~pd.isnull(df["Cost: Air Source Heat Pump"])
+
+# We fill missings:
+for col in [
+ "Recommendation: Air Source Heat Pump", "Recommendation: Cavity Wall Insulation",
+ "Recommendation: Double Glazing", "Recommendation: Loft Insulation", "Recommendation: Ventilation",
+ "Recommendation: Solar PV"
+]:
+ df[col] = df[col].fillna(False)
+
+for col in [
+ "Cost: Air Source Heat Pump", "Cost: Cavity Wall Insulation",
+ "Cost: Double Glazing", "Cost: Loft Insulation", "Cost: Ventilation",
+ "Cost: Solar PV"
+]:
+ df[col] = df[col].fillna(0)
+
+# Calculate post SAP
+df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
+df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
+df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))
+
+df["Recommendation: Air Source Heat Pump"].sum()
+df["Cost: Air Source Heat Pump"].sum()
+
+df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False)
diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py
new file mode 100644
index 00000000..1de91b50
--- /dev/null
+++ b/etl/customers/lambeth/re-knocks.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+data = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx", sheet_name="Possible Route",
+ header=1
+)
+
+data["Outcomes"].value_counts()
+
+# Strip out: No
+
+df = data[data["Outcomes"] == "See notes"]
+notes_df = df[
+ ("Notes (If 'no answer' under outcomes, have you checked around the property for access issues where "
+ "possible?)")].value_counts().to_frame()
+
+example = df[df["Notes (If 'no answer' under outcomes, have you checked around the property for access issues where "
+ "possible?)"] == ('Access to rear of property only through number 10. Overgrown athe rear of property '
+ 'installer wont be able to access')
+ ]
+
+# 18 did not attend
+#
diff --git a/etl/customers/livewest/route_march_2024_10_28.py b/etl/customers/livewest/route_march_2024_10_28.py
new file mode 100644
index 00000000..1b259fba
--- /dev/null
+++ b/etl/customers/livewest/route_march_2024_10_28.py
@@ -0,0 +1,225 @@
+import os
+import time
+
+import pandas as pd
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from utils.s3 import read_excel_from_s3
+from backend.SearchEpc import SearchEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list):
+    """
+    Look up the newest EPC and its recommendations for every row of ``asset_list``.
+
+    :param asset_list: DataFrame with "Postcode", "Number", "Full Address" and "row_id" columns.
+    :return: (epc_data, errors) - one dict per home with an EPC, and the row ids whose lookup raised.
+    """
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            searcher = SearchEpc(
+                address1=str(home["Number"]),
+                postcode=home["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=home["Full Address"],
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Look for EPC recommendations; a failed lookup is treated as "no recommendations"
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except Exception:
+                property_recommendations = {"rows": []}
+
+            epc_data.append({
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"],
+            })
+        except Exception:
+            # Record the row for the caller's retry pass and back off before the next request
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def app():
+ """
+    This app pulls EPC data for some properties owned by Livewest
+
+ Data request contents:
+ Date of last EPC
+ Reason for EPC
+ SAP score on register
+ Property Type
+ Property Area
+ Property Age
+ Any Dimensions (HLP,PW,RH)
+ Property Wall Construction
+ Heating Type
+ Secondary Heating
+ Loft Insulation Depth
+
+ Additional if possible:
+ Heat loss calculations
+ EPC recommendations
+ Property UPRN
+
+ """
+ asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Downloads/LIVEWEST 3578 ECO4 ECO PLUS GBIS.xlsx", header=0
+ )
+ asset_list["row_id"] = asset_list.index
+
+ epc_data, errors = get_data(asset_list)
+
+ # We now retrieve any failed properties
+ asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+ epc_data_failed, _ = get_data(asset_list_failed)
+
+ # Append the failed data to the main data
+ epc_data.extend(epc_data_failed)
+
+ epc_df = pd.DataFrame(epc_data)
+
+ # We expand out the recommendations
+ recommendations_df = epc_df[["row_id", "recommendations"]]
+
+ unique_recommendations = set()
+ for _, row in recommendations_df.iterrows():
+ unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+ columns = ["row_id"] + list(unique_recommendations)
+ transformed_data = []
+ for _, row in recommendations_df.iterrows():
+ # Initialize a dictionary for this row with False for all recommendations
+ row_data = {col: False for col in columns}
+ row_data["row_id"] = row["row_id"]
+
+ # Set True for each recommendation present in this row
+ for rec in row["recommendations"]:
+ recommendation_text = rec["improvement-summary-text"]
+ row_data[recommendation_text] = True
+
+ # Append the row data to transformed_data
+ transformed_data.append(row_data)
+
+ transformed_df = pd.DataFrame(transformed_data)
+ # Drop the column that is ""
+ transformed_df = transformed_df.drop(columns=[""])
+
+ # Retrieve just the data we need
+ epc_df = epc_df[
+ [
+ "row_id",
+ "uprn",
+ "property-type",
+ "built-form",
+ "inspection-date",
+ "current-energy-rating",
+ "current-energy-efficiency",
+ "roof-description",
+ "walls-description",
+ "transaction-type",
+ # New fields needed
+ "secondheat-description",
+ "total-floor-area",
+ "construction-age-band",
+ "floor-height",
+ "number-habitable-rooms",
+ "mainheat-description",
+ #
+ "energy-consumption-current", # kwh/m2
+ ]
+ ]
+
+ asset_list = asset_list.merge(
+ epc_df,
+ how="left",
+ on="row_id"
+ ).merge(
+ transformed_df,
+ how="left",
+ on="row_id"
+ )
+
+ asset_list = asset_list.drop(columns=["row_id"])
+
+ # Rename the columns
+ asset_list = asset_list.rename(columns={
+ "inspection-date": "Date of last EPC",
+ "current-energy-efficiency": "SAP score on register",
+ "current-energy-rating": "EPC rating on register",
+ "property-type": "Property Type",
+ "built-form": "Archetype",
+ "total-floor-area": "Property Floor Area",
+ "construction-age-band": "Property Age Band",
+ "floor-height": "Property Floor Height",
+ "number-habitable-rooms": "Number of Habitable Rooms",
+ "walls-description": "Wall Construction",
+ "roof-description": "Roof Construction",
+ "mainheat-description": "Heating Type",
+ "secondheat-description": "Secondary Heating",
+ "transaction-type": "Reason for last EPC",
+ "energy-consumption-current": "Heat Demand (kWh/m2)"
+ })
+
+ asset_list["Estimated Number of Floors"] = asset_list.apply(
+ lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+ x["Property Type"]) else None, axis=1
+ )
+
+ asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+ # Replace "" value with None
+ asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
+ asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+ asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+ lambda x: estimate_perimeter(
+ floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+ num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+ ), axis=1
+ )
+
+ asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
+ lambda x: estimate_external_wall_area(
+ num_floors=x["Estimated Number of Floors"],
+ floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+ perimeter=x["Estimated Perimeter (m)"],
+ built_form=x["Archetype"]
+ ),
+ axis=1
+ )
+
+ asset_list["Roof Insulation Thickness"] = asset_list.apply(
+ lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+ x["Roof Construction"]) else None,
+ axis=1
+ )
+
+ # Store as an excel
+ filename = "livewest EPC Data pull - 29 Oct.xlsx"
+ asset_list.to_excel(filename, index=False)
diff --git a/etl/customers/mod/pilot/1. Create Sample.py b/etl/customers/mod/pilot/1. Create Sample.py
new file mode 100644
index 00000000..fd045294
--- /dev/null
+++ b/etl/customers/mod/pilot/1. Create Sample.py
@@ -0,0 +1,205 @@
+import os
+import pandas as pd
+from tqdm import tqdm
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from asset_list.utils import get_data
+from utils.s3 import save_csv_to_s3
+
+PORTFOLIO_ID = 139
+USER_ID = 8
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def app():
+ """
+    Given the sample data and additional properties, this function prepares the data
+ :return:
+ """
+ folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme"
+ sample_list = pd.read_excel(f"{folder_path}/20250227_DIO_Accommodation_Sample_Properties.xlsx")
+ asset_data = pd.read_excel(f"{folder_path}/20250303_DIO_Accommodation_Property_Attribution.xlsx")
+
+ sample_list = sample_list[sample_list["BLDNG_COUNTRY_NAME"].isin(["ENGLAND", "WALES"])]
+
+ # Merge on the UPRN
+ sample_list = sample_list.merge(
+ asset_data[["BLDNG_ID", "BLNDG_GOVERMENT_UPRN"]].drop_duplicates(),
+ how="left", on="BLDNG_ID"
+ )
+ sample_list["BLNDG_GOVERMENT_UPRN"] = sample_list["BLNDG_GOVERMENT_UPRN"].astype("Int64")
+
+ # Use the EPC API to get corrected postcodes
+ model_asset_list = []
+ missed = []
+ for _, x in tqdm(sample_list.iterrows(), total=len(sample_list)):
+
+ if pd.isnull(x["BLNDG_GOVERMENT_UPRN"]):
+ continue
+ searcher = SearchEpc(
+ address1="",
+ postcode="",
+ uprn=x["BLNDG_GOVERMENT_UPRN"],
+ auth_token=EPC_AUTH_TOKEN,
+ os_api_key=""
+ )
+ searcher.find_property(skip_os=True)
+ newest_epc = searcher.newest_epc
+ if newest_epc is None:
+ missed.append(x["BLNDG_GOVERMENT_UPRN"])
+ continue
+
+ model_asset_list.append(newest_epc)
+
+ model_asset_list = pd.DataFrame(model_asset_list)
+ model_asset_list["uprn"] = model_asset_list["uprn"].astype(int)
+
+ spatial_data = OpenUprnClient.get_spatial_data(
+ uprns=model_asset_list["uprn"].tolist(), bucket_name="retrofit-data-dev"
+ )
+
+ # We determine if the building is listed, heritage or in a conservation area
+
+ # Merge on the property features
+ features = asset_data.drop(
+ columns=["BUILDING_SYSTEM_ITEM_NAME", "OBSERVED_CONDITION_DESCRIPTION"]
+ ).drop_duplicates()
+
+ df = features.merge(
+ model_asset_list, how="inner", right_on="uprn", left_on="BLNDG_GOVERMENT_UPRN"
+ ).merge(
+ pd.DataFrame(spatial_data).rename(columns={"UPRN": "uprn"}), how="left", on="uprn"
+ )
+
+ # Store data locally
+ # df.to_csv(folder_path + "/MOD property data.csv", index=False)
+
+    # Produce an asset list for analysis
+
+ df["row_id"] = df.index
+
+ epc_data, errors, no_epc = get_data(
+ df=df,
+ manual_uprn_map={},
+ epc_auth_token=EPC_AUTH_TOKEN,
+ uprn_column="uprn",
+ fulladdress_column="address",
+ address1_column="address1",
+ postcode_column="postcode",
+ property_type_column=None,
+ built_form_column=None,
+ epc_api_only=False,
+ row_id_name="row_id",
+ )
+
+ non_invasive_recommendations = []
+ for x in epc_data:
+ non_invasive_recommendations.append(
+ {
+ "uprn": x["uprn"],
+ "recommendations": x["find_my_epc_data"]["recommendations"]
+ }
+ )
+
+ # also include the floor area
+ asset_list = df[
+ ["uprn", "address1", "postcode", "NUMBER_OF_BEDROOMS", "BLDNG_STOREYS_QTY", "BLDNG_MSRMNT_VAL"]
+ ].rename(
+ columns={
+ "address1": "address",
+ "NUMBER_OF_BEDROOMS": "n_bedrooms",
+ "BLDNG_STOREYS_QTY": "number_of_floors",
+ "BLDNG_MSRMNT_VAL": "floor_area"
+ }
+ )
+
+ filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
+ save_csv_to_s3(
+ dataframe=asset_list,
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=filename
+ )
+
+ # Store the non-invasive recommendations in s3
+ non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(non_invasive_recommendations),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=non_invasive_recommendations_filename
+ )
+
+ # Scenario 1 - EPC C
+ body = {
+ "portfolio_id": str(PORTFOLIO_ID),
+ "housing_type": "Private",
+ "goal": "Increasing EPC",
+ "goal_value": "C",
+ "trigger_file_path": filename,
+ "already_installed_file_path": "",
+ "patches_file_path": "",
+ "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+ "valuation_file_path": "",
+ "scenario_name": "Hit EPC C",
+ "multi_plan": True,
+ "budget": None,
+ # "inclusions": [
+ # "cavity_wall_insulation",
+ # "loft_insulation",
+ # "windows",
+ # "solar_pv",
+ # "air_source_heat_pump"
+ # ]
+ }
+ print(body)
+
+ # Scenario 2 - EPC B
+ body = {
+ "portfolio_id": str(PORTFOLIO_ID),
+ "housing_type": "Private",
+ "goal": "Increasing EPC",
+ "goal_value": "B",
+ "trigger_file_path": filename,
+ "already_installed_file_path": "",
+ "patches_file_path": "",
+ "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+ "valuation_file_path": "",
+ "scenario_name": "Hit EPC B",
+ "multi_plan": True,
+ "budget": None,
+ # "inclusions": [
+ # "cavity_wall_insulation",
+ # "loft_insulation",
+ # "windows",
+ # "solar_pv",
+ # "air_source_heat_pump"
+ # ]
+ }
+ print(body)
+
+ # Scenario 3 - EPC B, 3.5 COP ASHP
+ body = {
+ "portfolio_id": str(PORTFOLIO_ID),
+ "housing_type": "Private",
+ "goal": "Increasing EPC",
+ "goal_value": "B",
+ "trigger_file_path": filename,
+ "already_installed_file_path": "",
+ "patches_file_path": "",
+ "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+ "valuation_file_path": "",
+ "scenario_name": "Hit EPC B - 3.5 COP ASHP",
+ "multi_plan": True,
+ "budget": None,
+ "ashp_cop": 3.5
+ # "inclusions": [
+ # "cavity_wall_insulation",
+ # "loft_insulation",
+ # "windows",
+ # "solar_pv",
+ # "air_source_heat_pump"
+ # ]
+ }
+ print(body)
diff --git a/etl/customers/mod/pilot/2. Create Excel Model.py b/etl/customers/mod/pilot/2. Create Excel Model.py
new file mode 100644
index 00000000..9a9eda86
--- /dev/null
+++ b/etl/customers/mod/pilot/2. Create Excel Model.py
@@ -0,0 +1,652 @@
+from pprint import pprint
+import pandas as pd
+import numpy as np
+from backend.app.utils import sap_to_epc
+from sqlalchemy.orm import sessionmaker
+from backend.app.db.connection import db_engine
+from backend.app.db.models.recommendations import Recommendation, Plan, PlanRecommendations
+from backend.app.db.models.portfolio import PropertyModel, PropertyDetailsEpcModel
+
+
+def get_data(portfolio_id, scenario_ids):
+    """
+    Load the properties, plans and default recommendations used to build the model.
+
+    :param portfolio_id: Portfolio whose properties (joined to their EPC details) are read.
+    :param scenario_ids: Scenario IDs whose plans and recommendations are read.
+    :return: (properties_data, plans_data, recommendations_data), each a list of dicts keyed
+        by column name; every recommendation dict also carries a "Scenario ID" entry.
+    """
+    session = sessionmaker(bind=db_engine)()
+    session.begin()
+    try:
+        # Get properties and their details for a specific portfolio
+        properties_query = session.query(
+            PropertyModel,
+            PropertyDetailsEpcModel
+        ).join(
+            PropertyDetailsEpcModel, PropertyModel.id == PropertyDetailsEpcModel.property_id
+        ).filter(
+            PropertyModel.portfolio_id == portfolio_id
+        ).all()
+
+        # Flatten each joined row into one dict; EPC detail columns win on name clashes
+        properties_data = [
+            {**{col.name: getattr(prop.PropertyModel, col.name) for col in PropertyModel.__table__.columns},
+             **{col.name: getattr(prop.PropertyDetailsEpcModel, col.name) for col in
+                PropertyDetailsEpcModel.__table__.columns}}
+            for prop in properties_query
+        ]
+
+        # Plans are selected by scenario only (there is no portfolio filter on this query)
+        plans_query = session.query(Plan).filter(Plan.scenario_id.in_(scenario_ids)).all()
+        plans_data = [
+            {col.name: getattr(plan, col.name) for col in Plan.__table__.columns}
+            for plan in plans_query
+        ]
+        plan_ids = [plan['id'] for plan in plans_data]
+
+        # Default recommendations linked to those plans, each paired with its plan's scenario ID
+        recommendations_query = session.query(
+            Recommendation,
+            Plan.scenario_id
+        ).join(
+            PlanRecommendations, Recommendation.id == PlanRecommendations.recommendation_id
+        ).join(
+            Plan, Plan.id == PlanRecommendations.plan_id
+        ).filter(
+            PlanRecommendations.plan_id.in_(plan_ids),
+            Recommendation.default == True  # noqa: E712 - builds a SQL expression
+        ).all()
+
+        recommendations_data = [
+            {**{col.name: getattr(rec.Recommendation, col.name) for col in Recommendation.__table__.columns},
+             "Scenario ID": rec.scenario_id}
+            for rec in recommendations_query
+        ]
+    finally:
+        # Always hand the connection back, even when one of the queries raises
+        session.close()
+
+    return properties_data, plans_data, recommendations_data
+
+
+def app():
+ """
+ Given a portfolio and a scenario, this function prepares an excel model to present the data
+ """
+
+ # Set the inputs:
+ portfolio_id = 139
+ scenario_ids = [237, 238]
+
+ properties_data, plans_data, recommendations_data = get_data(
+ portfolio_id=portfolio_id, scenario_ids=scenario_ids
+ )
+
+ properties_df = pd.DataFrame(properties_data)
+ plans_df = pd.DataFrame(plans_data)
+ recommendations_df = pd.DataFrame(recommendations_data)
+
+    # Merge on the original data
+ mod_property_data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD property data.csv"
+ )
+
+ property_asset_data = properties_df.merge(
+ mod_property_data.drop(columns=["address", "postcode", "tenure"]), how="left", on="uprn"
+ )
+
+ property_asset_data["is_pitched"] = property_asset_data["roof"].str.contains("pitched", case=False)
+ property_asset_data["pre_1970"] = property_asset_data["BUILD_YEAR"] < 1970
+ property_asset_data["wall_type"] = property_asset_data["walls"].str.split(" ").str[0].str.strip()
+ property_asset_data["is_insulated"] = (
+ property_asset_data["walls"].str.split(",").str[1].str.strip().isin(
+ ["filled cavity", "with external insulation", "filled cavity and external insulation"]
+ ) | property_asset_data["walls"].str.split(",").str[2].str.strip().isin(["insulated"])
+ )
+ property_asset_data["is_insulated"] = np.where(
+ property_asset_data["is_insulated"], "Insulated", "Uninsulated"
+ )
+ property_asset_data["is_pitched"] = np.where(
+ property_asset_data["is_pitched"], "Pitched roof", "Not Pitched Roof"
+ )
+ property_asset_data["pre_1970"] = np.where(
+ property_asset_data["pre_1970"], "Pre 1970", "Post 1970"
+ )
+
+ archetype_variables = ["property_type", "wall_type", "is_insulated", "is_pitched", "pre_1970"]
+
+ assigned_archetypes = (
+ property_asset_data.groupby(
+ archetype_variables
+ ).size().reset_index().rename(columns={0: "n_properties"}).sort_values("n_properties", ascending=False)
+ )
+
+ # Make the archetype ID a concatenation of the variables
+ assigned_archetypes["archetype_id"] = assigned_archetypes[archetype_variables].apply(
+ lambda x: "_".join(x.astype(str)), axis=1
+ )
+
+ # Most prominent archetypes
+ prominent_archetypes = assigned_archetypes.head(6)
+ other_archetypes = assigned_archetypes.tail(-6)
+ # 2 or fewer properties in the other archetypes
+
+ property_asset_data = property_asset_data.merge(
+ assigned_archetypes[archetype_variables + ["archetype_id"]],
+ how="left",
+ on=archetype_variables
+ )
+
+ # Create age bands:
+ # 1960-1969
+ # 1970-1979
+ # 1980-1989
+ # 1990-1999
+ # 2000+
+ property_asset_data["age_band"] = pd.cut(
+ property_asset_data["BUILD_YEAR"],
+ bins=[1959, 1969, 1979, 1989, 1999, 2022],
+ labels=["1960-1969", "1970-1979", "1980-1989", "1990-1999", "2000+"]
+ )
+
+ # Create floor area bands
+ # 0-73
+ # 74-97
+ # 98-199
+ # 200+
+ property_asset_data["floor_area_band"] = pd.cut(
+ property_asset_data["total_floor_area"],
+ bins=[0, 73, 97, 199, 10000],
+ labels=["0-73", "74-97", "98-199", "200+"]
+ )
+
+ property_asset_data["archetype_group"] = property_asset_data["archetype_id"].copy()
+ property_asset_data["archetype_group"] = np.where(
+ property_asset_data["archetype_id"].isin(other_archetypes["archetype_id"].values),
+ "other",
+ property_asset_data["archetype_group"]
+ )
+
+ # For colour
+ wall_types = (
+ property_asset_data[["wall_type"]].value_counts().to_frame().reset_index().rename(
+ columns={"wall_type": "Wall Type"}
+ )
+ )
+ # Group into age bands
+ ages = (
+ property_asset_data[["age_band"]].value_counts()
+ .to_frame()
+ .reset_index().sort_values("age_band", ascending=True)
+ .rename(columns={"age_band": "Age Band"})
+ )
+ floor_area_bands = (
+ property_asset_data[["floor_area_band"]].value_counts()
+ .to_frame()
+ .reset_index().sort_values("floor_area_band", ascending=True)
+ .rename(columns={"floor_area_band": "Floor Area Band"})
+ )
+ archetype_counts = (
+ property_asset_data[["archetype_group"]].
+ value_counts().
+ to_frame().
+ reset_index()
+ .rename(columns={"archetype_group": "Archetype"})
+ )
+ property_types = (
+ (property_asset_data["property_type"] + ": " + property_asset_data["built_form"]).
+ value_counts().
+ to_frame().
+ reset_index()
+ .rename(columns={"index": "Property Type", 0: "Count"})
+ )
+
+ # epc breakdown
+ epc_breakdown = (
+ property_asset_data["current_epc_rating"]
+ .apply(lambda x: x.value)
+ .value_counts()
+ .to_frame()
+ .reset_index()
+ )
+
+ # Figures for the deck
+ # Carbon per property
+ totals = property_asset_data[
+ [
+ "Total_household_members",
+ "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
+ "heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
+ "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
+ ]
+ ].copy()
+ totals["total_cost"] = (
+ totals["heating_cost_current"] +
+ totals["hot_water_cost_current"] +
+ totals["lighting_cost_current"] +
+ totals["appliances_cost_current"] +
+ totals["gas_standing_charge"] +
+ totals["electricity_standing_charge"]
+ )
+ print(
+ totals[
+ [
+ "Total_household_members",
+ "co2_emissions",
+ "current_energy_demand",
+ "total_cost",
+ ]
+ ].mean()
+ )
+
+ # Store these to an excel
+ # with pd.ExcelWriter(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/MOD archetype breakdowns.xlsx"
+ # ) as writer:
+ # wall_types.to_excel(writer, sheet_name="Wall Types", index=False)
+ # ages.to_excel(writer, sheet_name="Ages", index=False)
+ # floor_area_bands.to_excel(writer, sheet_name="Floor Area Bands", index=False)
+ # archetype_counts.to_excel(writer, sheet_name="Archetype Counts", index=False)
+ # epc_breakdown.to_excel(writer, sheet_name="EPC Rating", index=False)
+
+ contingency = 0.26
+
+ # We prepare the outputs, by scenario
+ scenario_data = {}
+ for scenario in scenario_ids:
+
+ scenario_recommendations_df = recommendations_df[
+ recommendations_df["Scenario ID"] == scenario
+ ].copy()
+
+ scenario_recommendations_df["contingency"] = contingency * scenario_recommendations_df["estimated_cost"]
+ scenario_recommendations_df["total_cost"] = (
+ scenario_recommendations_df["estimated_cost"] + scenario_recommendations_df["contingency"]
+ )
+
+ recommended_measures_df = scenario_recommendations_df[
+ ["property_id", "measure_type", "estimated_cost", "default"]
+ ]
+
+ recommended_measures_df = recommended_measures_df[recommended_measures_df["default"]]
+ recommended_measures_df = recommended_measures_df.drop(columns=["default"])
+
+ # Metrics by property ID
+ aggregated_metrics = scenario_recommendations_df[
+ [
+ "property_id", "type", "default", "sap_points",
+ "energy_cost_savings", "kwh_savings", "co2_equivalent_savings", "estimated_cost", "contingency",
+ "total_cost"
+ ]
+ ]
+ aggregated_metrics = aggregated_metrics[aggregated_metrics["default"]]
+ aggregated_metrics = aggregated_metrics.groupby("property_id")[
+ ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
+ "total_cost", "contingency"]
+ ].sum().reset_index()
+
+ recommendations_measures_pivot = recommended_measures_df.pivot(
+ index='property_id',
+ columns='measure_type',
+ values='estimated_cost'
+ )
+ recommendations_measures_pivot = recommendations_measures_pivot.reset_index()
+ recommendations_measures_pivot = recommendations_measures_pivot.fillna(0)
+
+ # We flag with boolean if the measure is recommended
+ for c in recommendations_measures_pivot.columns:
+ if c == "property_id":
+ continue
+ recommendations_measures_pivot["Recommendation: " + c] = recommendations_measures_pivot[c] > 0
+
+ # We now create a final output
+ df = properties_df[
+ [
+ "property_id", "uprn", "address", "postcode", "property_type", "walls", "roof", "heating", "windows",
+ "current_epc_rating", "current_sap_points", "total_floor_area", "number_of_rooms",
+ "co2_emissions", "current_energy_demand", "current_energy_demand_heating_hotwater",
+ "heating_cost_current", "hot_water_cost_current", "lighting_cost_current",
+ "appliances_cost_current", "gas_standing_charge", "electricity_standing_charge"
+ ]
+ ].merge(
+ recommendations_measures_pivot, how="left", on="property_id"
+ ).merge(
+ aggregated_metrics, how="left", on="property_id"
+ )
+
+ df["bills_total_cost"] = (
+ df["heating_cost_current"] + df["hot_water_cost_current"] + df["lighting_cost_current"] +
+ df["appliances_cost_current"] + df["gas_standing_charge"] + df["electricity_standing_charge"]
+ )
+
+ df = df.drop(columns=["property_id"])
+ for c in ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings"]:
+ df[c] = df[c].fillna(0)
+
+ df = df.rename(
+ columns={
+ "uprn": "UPRN",
+ "address": "Address",
+ "postcode": "Postcode",
+ "walls": "Walls",
+ "roof": "Roof",
+ "heating": "Heating",
+ "windows": "Windows",
+ "current_epc_rating": "Current EPC Rating",
+ "current_sap_points": "Current SAP Points",
+ "total_floor_area": "Total Floor Area",
+ "number_of_rooms": "Number of Habitable Rooms",
+ "floor_height": "Floor Height",
+ }
+ )
+
+ # Calculate post SAP
+ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
+ df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
+ df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))
+
+ # Calculate the relative savings on carbon, kwh, and bills
+ df["relative_carbon_savings"] = df["co2_equivalent_savings"] / df["co2_emissions"]
+ df["relative_kwh_savings"] = df["kwh_savings"] / df["current_energy_demand"]
+ df["relative_bill_savings"] = df["energy_cost_savings"] / df["bills_total_cost"]
+
+ # Add on the archetype
+ df = df.merge(
+ property_asset_data[["uprn", "archetype_group"]], how="left", left_on="UPRN", right_on="uprn"
+ )
+
+ # For properties that don't make it to EPC B, check why. E.g. for a property that has an oil boiler, it
+ # the bills go up recommending HHRSH, so it doesn't make it to EPC B
+ # For mid-terrace units, use the ordnance survey API to check if there is space for a heat pump?
+ # DO it manually???
+
+ # Doesn't make it
+ # misses = df[df["Predicted Post Works EPC"] == "C"]
+ # # 5 of them are flats and so are difficult to get to EPC B without renewables. Possibly not worth it from an
+ # # ROI perspective
+ #
+ # misses[["UPRN", "Address", "Postcode", "property_type"]]
+
+ # UPRN Address Postcode property_type
+ # 2 100120988937 13 Sidbury Circular Road SP9 7HX Flat No further action
+ # 3 100120988998 74 Sidbury Circular Road SP9 7JA Flat No further action
+ # 4 100120989416 47 Zouch Avenue SP9 7LR Flat No further action
+ # 6 100060585002 42, Muscott Close, Shipton Bellinger SP9 7TX House Can probably take a heat pump
+ # 37 10000801072 34 Luffenham Place, Chicksands SG17 5XH House Already surveyed as having
+ # an ASHP - should be looked at
+ # 121 100120988259 8, Karachi Close SP9 7LW Flat
+ # 122 100121101217 599, Pepper Place BA12 0DW Flat
+ # 140 100021455241 33 Blenheim Crescent, Ruislip HA4 7HA House - Solar isnt recommended
+ # due to bug
+ # 149 100120915656 10 Bower Green, Shrivenham SN6 8TU House - Solar isn't recommended
+ # due to bug
+
+ scenario_data[scenario] = df
+
+ printing_scenario_id = scenario_ids[0]
+ # EPC breakdown
+ print(scenario_data[printing_scenario_id]['Predicted Post Works EPC'].value_counts())
+ # Cost
+ # Total cost
+ print(scenario_data[printing_scenario_id]["total_cost"].sum())
+ # Base cost
+ print(scenario_data[printing_scenario_id]["estimated_cost"].sum())
+ # Contingency
+ print(scenario_data[printing_scenario_id]["contingency"].sum())
+ # Costs averaged per unit
+ print(scenario_data[printing_scenario_id]["total_cost"].mean())
+ print(scenario_data[printing_scenario_id]["estimated_cost"].mean())
+ print(scenario_data[printing_scenario_id]["contingency"].mean())
+
+ # Average relative savings
+ print(scenario_data[printing_scenario_id]["relative_carbon_savings"].mean())
+ print(scenario_data[printing_scenario_id]["relative_kwh_savings"].mean())
+ print(scenario_data[printing_scenario_id]["relative_bill_savings"].mean())
+
+ measure_details = {}
+ for scenario in scenario_ids:
+ measure_details[scenario] = {}
+ recommendation_cols = [c for c in scenario_data[scenario].columns if "Recommendation:" in c]
+ measure_details[scenario]["count"] = scenario_data[scenario][recommendation_cols].sum().to_dict()
+ # Get average cost per measure
+ measure_columns = [
+ c.split("Recommendation: ")[1] for c in scenario_data[scenario].columns if "Recommendation:" in c
+ ]
+ # Take the mean, drop zero columns
+ measure_costs = {}
+ for m in measure_columns:
+ measure_costs[m] = float(scenario_data[scenario][scenario_data[scenario][m] > 0][m].mean())
+ measure_details[scenario]["cost_per_measure"] = measure_costs
+
+ pprint(measure_details[scenario_ids[0]]["count"])
+ pprint(measure_details[scenario_ids[1]]["count"])
+
+ # Cost per measures
+ pprint(measure_details[scenario_ids[0]]["cost_per_measure"])
+ pprint(measure_details[scenario_ids[1]]["cost_per_measure"])
+
+ # Do not get to EPC B:
+ # 5 are flats
+ # 1) 34 Luffenham Place, Chicksands SG17 5XH, has been surveyed as having a low performing heat pump -
+ # should be looked at but several surrounding properties have been surveyed in a similar fashion
+ # 2) 42, Muscott Close, Shipton Bellinger SP9 7TX, has an oil boiler and the bills go up recommending HHRSH.
+ # we could non-intrusively recommend a heat pump.
+ # 3) 33 Blenheim Crescent, Ruislip, HA4 7HA, 100021455241 Solar potential modelling returned nothing -
+    # manual review indicates that there are multiple trees surrounding the south facing side of the property
+ # 4) 10 Bower Green, Shrivenham, SN6 8TU - Solar isn't recommended without further survey due to the local
+ # area being surrounded by trees
+
+ # Scenario adjustments:
+ # Exclude: boiler_upgrade
+ # Make ASHP COP 3.5
+
+ # Metrics we need by scenario:
+ # Cost
+ # contingency
+ # Carbon
+ # kwh
+ # bill savings
+ scenario_metrics = {}
+ for scenario in scenario_ids:
+ df = scenario_data[scenario].copy()
+
+ avg_savings = df[
+ ["sap_points", "co2_equivalent_savings", "energy_cost_savings", "kwh_savings", "estimated_cost",
+ "total_cost", "contingency"]
+ ].mean().to_dict()
+ avg_savings["cost_per_sap_point"] = avg_savings["total_cost"] / avg_savings["sap_points"]
+ avg_savings["cost_per_carbon"] = avg_savings["total_cost"] / avg_savings["co2_equivalent_savings"]
+ scenario_metrics[scenario] = avg_savings
+
+ pprint(scenario_metrics[scenario_ids[0]])
+ pprint(scenario_metrics[scenario_ids[1]])
+
+ scenario_data[scenario_ids[0]]["loft_insulation"][
+ scenario_data[scenario_ids[0]]["loft_insulation"] > 0
+ ].mean()
+
+ scenario_data[scenario_ids[0]]["cavity_wall_insulation"][
+ scenario_data[scenario_ids[0]]["cavity_wall_insulation"] > 0
+ ].mean()
+
+    # Testing: checking flood risk
+
+ import requests
+
+    def get_flood_risk(lat, lon, radius_km=1):
+        """
+        Print and return the flood warnings the flood-monitoring API reports within radius_km of (lat, lon).
+        Returns the list under the response's "items" key (an empty list when there is none).
+        """
+        response = requests.get(
+            "https://environment.data.gov.uk/flood-monitoring/id/floods",
+            params={'lat': lat, 'long': lon, 'dist': radius_km},  # dist: search radius in km
+            timeout=30,  # requests has no default timeout; don't let a stalled call hang the run
+        )
+        response.raise_for_status()
+        flood_warnings = response.json().get("items", [])
+
+        if not flood_warnings:
+            print("No active flood warnings near this location.")
+            return flood_warnings
+
+        print(f"{len(flood_warnings)} warning(s) found near the location:")
+        for warning in flood_warnings:
+            print(f"- Area: {warning.get('description')}")
+            print(f" Severity: {warning.get('severity')} (Level {warning.get('severityLevel')})")
+            print(f" Message changed at: {warning.get('timeMessageChanged')}")
+            print()
+
+        return flood_warnings
+
+ from shapely.geometry import shape, Point
+    def get_flood_areas_near_point(lat, lon, radius_km=2):
+        """Return the flood areas the flood-monitoring API lists within radius_km of (lat, lon)."""
+        response = requests.get(
+            "https://environment.data.gov.uk/flood-monitoring/id/floodAreas",
+            params={'lat': lat, 'long': lon, 'dist': radius_km},
+            # requests never times out by default; bound the wait so one stalled call can't hang the caller
+            timeout=30,
+        )
+        # Raise on HTTP errors rather than trying to read "items" from an error payload
+        response.raise_for_status()
+        return response.json().get("items", [])
+
+    def point_in_flood_area(lat, lon):
+        """
+        Return the first flood area within 1 km whose polygon contains (lat, lon), or None if there is none.
+        Only the first feature of each area's polygon GeoJSON is tested.
+        """
+        flood_areas = get_flood_areas_near_point(lat, lon, radius_km=1)
+        point = Point(lon, lat)  # GeoJSON uses (lon, lat) format
+
+        for area in flood_areas:
+            polygon_url = area.get("polygon")
+            if not polygon_url:
+                continue
+
+            polygon_response = requests.get(polygon_url, timeout=30)  # bound the wait per polygon download
+            polygon_response.raise_for_status()
+            features = polygon_response.json().get("features", [])
+            if not features:
+                continue
+            flood_polygon = shape(features[0]['geometry'])
+            try:
+                is_inside = flood_polygon.contains(point)
+            except Exception:  # a failed geometry test counts as a miss; Ctrl-C / SystemExit still propagate
+                is_inside = False
+            if is_inside:
+                print(f"📍 Point is inside flood area: {area['label']} ({area['notation']})")
+                return area
+        return None
+
+ from tqdm import tqdm
+ floor_warnings_data = []
+ for _, property in tqdm(property_asset_data.iterrows(), total=len(property_asset_data)):
+ # warnings = floor_warnings_data.extend(
+ # get_flood_risk(lat=property["LATITUDE"], lon=property["LONGITUDE"], radius_km=1)
+ # )
+
+ resp = point_in_flood_area(lat=property["LATITUDE"], lon=property["LONGITUDE"])
+ if resp:
+ floor_warnings_data.append(
+ {
+ "uprn": property["uprn"],
+ "address": property["address"],
+ "postcode": property["postcode"],
+ "area": resp
+ }
+ )
+ continue
+
+ import plotly.graph_objects as go
+
+ labels = [
+ "House_Cavity_Insulated_Pitched roof_Pre 1970",
+ "House_Cavity_Insulated_Pitched roof_Post 1970",
+ "House_Cavity_Uninsulated_Pitched roof_Pre 1970",
+ "House_Cavity_Uninsulated_Pitched roof_Post 1970",
+ "other",
+ "House_System_Uninsulated_Pitched roof_Pre 1970",
+ "House_Solid_Uninsulated_Not Pitched Roof_Pre 1970"
+ ]
+
+ values = [62, 36, 21, 16, 16, 4, 2]
+
+ hovertext = [
+ "Loft insulation, draft proofing",
+ "Top-up loft insulation",
+ "Cavity wall insulation, loft insulation",
+ "Cavity wall insulation, ventilation",
+ "Bespoke retrofit measures",
+ "External wall insulation, roof insulation",
+ "Flat roof insulation, internal wall insulation"
+ ]
+
+ fig = go.Figure(go.Treemap(
+ labels=labels,
+ parents=[""] * len(labels), # No root
+ values=values,
+ hovertext=hovertext,
+ hoverinfo="text",
+ textinfo="none",
+ marker=dict(
+ line=dict(color="white", width=4),
+ colors=values,
+ colorscale="Blues"
+ )
+ ))
+
+ fig.update_layout(
+ margin=dict(t=10, l=10, r=10, b=10),
+ plot_bgcolor="white",
+ paper_bgcolor="white"
+ )
+
+ fig.show()
+
+ # Get the recommended measures by scenario id
+ recommendation_cols = [c for c in scenario_data[scenario_ids[1]].columns if "Recommendation:" in c]
+ measure_counts_by_scenario = scenario_data[scenario_ids[1]].groupby("archetype_group")[
+ recommendation_cols
+ ].sum().reset_index()
+
+ measure_counts_by_scenario.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/measure_counts_by_scenario.csv"
+ )
+
+    # Estimate average valuation improvement by scenario
+ valuation_data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme/property_valuation.csv"
+ )
+
+ from backend.ml_models.Valuation import PropertyValuation
+
+ uplift = []
+ for _, x in valuation_data.iterrows():
+ uprn = x["uprn"]
+
+ to_append = {"uprn": uprn}
+ for _id in scenario_ids:
+ scenario = scenario_data[_id][
+ scenario_data[_id]["uprn"] == uprn
+ ].squeeze()
+
+ val = PropertyValuation.estimate_valuation_improvement(
+ current_value=x["valuation"],
+ current_epc=scenario["Current EPC Rating"].value,
+ target_epc=scenario["Predicted Post Works EPC"],
+ total_cost=None
+ )
+
+ to_append[_id] = val["average_increase"]
+
+ uplift.append(to_append)
+
+ uplift = pd.DataFrame(uplift)
+ print(uplift[scenario_ids[0]].mean())
+ # £8,161
+ print(uplift[scenario_ids[1]].mean())
+ # £16,938
diff --git a/etl/customers/mod/pilot/3. Past Project Costs.py b/etl/customers/mod/pilot/3. Past Project Costs.py
new file mode 100644
index 00000000..79a0493c
--- /dev/null
+++ b/etl/customers/mod/pilot/3. Past Project Costs.py
@@ -0,0 +1,46 @@
+"""Split the MOD measure cost study into per-measure invoice tables and save them as CSVs."""
+import pandas as pd
+
+# Every input and output of this script lives in the same customer folder
+BASE_DIR = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/MOD/Pilot Programme"
+MODEL_COLUMN = "Model"
+INVOICED_COLUMN = "Total invoiced (including VAT)"
+
+
+def _measure_costs(costs, label, suffix=""):
+    """
+    Extract one measure's model / invoiced-total column pair from the cost sheet.
+
+    :param costs: Cost study sheet loaded with pandas.
+    :param label: Measure prefix put in front of the model name, e.g. "CWI".
+    :param suffix: Suffix on this measure's repeated column headers ("", ".2", ".3", ...).
+    :return: Rows that have an invoiced total, as columns "Model" and "Total invoiced (including VAT)".
+    """
+    measure = costs[[MODEL_COLUMN + suffix, INVOICED_COLUMN + suffix]].copy()
+    measure.columns = [MODEL_COLUMN, INVOICED_COLUMN]
+    measure[MODEL_COLUMN] = label + " - " + measure[MODEL_COLUMN]
+    return measure[~pd.isnull(measure[INVOICED_COLUMN])]
+
+
+# Get the wave 2 costing data and produce some breakdowns
+costs = pd.read_excel(f"{BASE_DIR}/Measure cost study for MOD.xlsx", header=2)
+
+# One table per measure; the suffix selects that measure's copy of the repeated headers (".1" is not used)
+cwi_costs = _measure_costs(costs, "CWI")  # Cavity
+li_costs = _measure_costs(costs, "LI", ".2")  # Loft
+windows_costs = _measure_costs(costs, "Windows", ".3")
+doors_costs = _measure_costs(costs, "Doors", ".4")
+ashps_costs = _measure_costs(costs, "ASHP", ".5")
+solar_costs = _measure_costs(costs, "Solar", ".6")
+
+fabric_costing_data = pd.concat([cwi_costs, li_costs])
+windows_doors_costing_data = pd.concat([windows_costs, doors_costs])
+
+# Save each breakdown into the customer folder (to_csv writes the row index as the first column)
+windows_doors_costing_data.to_csv(f"{BASE_DIR}/windows_doors_costs.csv")
+fabric_costing_data.to_csv(f"{BASE_DIR}/fabric_costing_data.csv")
+ashps_costs.to_csv(f"{BASE_DIR}/ashps_costs.csv")
+solar_costs.to_csv(f"{BASE_DIR}/solar_costs.csv")
+
+# Mean total cost of works per value of the "Property age " column (the header has a trailing space)
+project_cost_by_age = costs[["Property age ", "TOTAL Cost of Works"]].groupby("Property age ").mean().reset_index()
diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py
new file mode 100644
index 00000000..ec57d9a4
--- /dev/null
+++ b/etl/customers/panacap/assets.py
@@ -0,0 +1,58 @@
+"""Pull EPC and spatial data for the Panacap acquisition addresses and save the joined table as a CSV."""
+import os
+
+import pandas as pd
+from dotenv import load_dotenv
+
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.route_march_data_pull.app import get_data
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+addresses = [
+    {"address": "3 Willis Road", "postcode": "CB1 2AQ"},
+    {"address": "22 Catharine Street", "postcode": "CB1 3AW"},
+    {"address": "332 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "330 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "328 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "71 Mill Road", "postcode": "CB1 2AS"},
+    {"address": "78 Argyle Street", "postcode": "CB1 3LZ"},
+    {"address": "9 Graham Road", "postcode": "CB4 2ZE"},
+    {"address": "217 Mill Road", "postcode": "CB1 3BE"},
+    {"address": "374 Mill Road", "postcode": "CB1 3NN"},
+    {"address": "174 Thoday Street", "postcode": "CB1 3AX"},
+    {"address": "37 Abbey Road", "postcode": "CB5 8HH"},
+    {"address": "18 Upper Gwydir Street", "postcode": "CB1 2LR"},
+    {"address": "21 Fulbourn Road Fulbourn", "postcode": "CB1 9JL"},
+    {"address": "108 Argyle Street", "postcode": "CB1 3LS"},
+    {"address": "115 Victoria Road", "postcode": "CB4 3BS"},
+    {"address": "55 Ross Street", "postcode": "CB1 3BP"},
+    {"address": "16 Kingston Street", "postcode": "CB1 2NU"},
+    {"address": "13 Thoday Street", "postcode": "CB1 3AS"},
+    {"address": "103 York Street", "postcode": "CB1 2PZ"},
+]
+
+# row_id is the key used below to join the EPC results back onto the input rows
+asset_list = pd.DataFrame(addresses)
+asset_list["row_id"] = asset_list.index
+
+epc_data, _, _ = get_data(
+    asset_list=asset_list, fulladdress_column="address", postcode_column="postcode", address1_column="address",
+    manual_uprn_map={}, epc_api_only=True
+)
+epc_df = pd.DataFrame(epc_data)
+
+# Left merge keeps every input address; the rename assumes the EPC frame also has address/postcode columns
+asset_list = asset_list.merge(epc_df, how="left", on="row_id")
+asset_list = asset_list.rename(columns={"address_x": "Address", "postcode_x": "Postcode"})
+# NOTE(review): rows with no EPC match have a missing uprn, which astype(str) turns into "nan" - confirm OK
+asset_list["uprn"] = asset_list["uprn"].astype(str)
+
+# Spatial attributes are looked up by UPRN; both join keys are cast to str before merging
+spatial_data = OpenUprnClient.get_spatial_data([x["uprn"] for x in epc_data], bucket_name="retrofit-data-dev")
+spatial_data["UPRN"] = spatial_data["UPRN"].astype(str)
+asset_list = asset_list.merge(spatial_data, how="left", left_on="uprn", right_on="UPRN")
+
+asset_list.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv",
+                  index=False)
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index a0d01f7d..a8805a71 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -1,9 +1,15 @@
+import os
import pandas as pd
+from dotenv import load_dotenv
from utils.s3 import save_csv_to_s3
+from etl.find_my_epc.AssetListEpcData import AssetListEpcData
-PORTFOLIO_ID = 111
+PORTFOLIO_ID = 141
USER_ID = 8
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
def app():
"""
@@ -13,10 +19,21 @@ def app():
asset_list = [
{
- "uprn": 100050770761,
- "address": "12 Sheardown Street",
- "postcode": "DN4 0BH"
- }
+ "address": "196 Merrow Street",
+ "postcode": "SE17 2NP",
+ "uprn": 200003423454,
+ "patch": True
+ },
+ {
+ "address": "65 Liverpool Grove",
+ "postcode": "SE17 2HP",
+ "uprn": 200003423194
+ },
+ {
+ "address": "2 Brettell Street",
+ "postcode": "SE17 2NZ",
+ "uprn": 200003423607
+ },
]
asset_list = pd.DataFrame(asset_list)
@@ -28,30 +45,46 @@ def app():
file_name=filename
)
- non_invasive_recommendations = [
- {
- "uprn": 100050770761,
- "recommendations": [
- {
- "type": "extension_cavity_wall_insulation",
- "sap_points": 2,
- }
- ]
- }
- ]
+ # Pull the non-invasive recommendations automatically
+ asset_list_epc_client = AssetListEpcData(
+ asset_list=asset_list,
+ epc_auth_token=EPC_AUTH_TOKEN
+ )
+ asset_list_epc_client.get_data()
+ asset_list_epc_client.get_non_invasive_recommendations()
+ asset_list_epc_client.get_patch()
+
# Store non-invasive recommendations in S3
non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
save_csv_to_s3(
- dataframe=pd.DataFrame(non_invasive_recommendations),
+ dataframe=pd.DataFrame(asset_list_epc_client.non_invasive_recommendations),
bucket_name="retrofit-plan-inputs-dev",
file_name=non_invasive_recommendations_filename
)
+ # Store patches in S3
+ patches_filename = ""
+ if asset_list_epc_client.patches:
+ patches_filename = f"{USER_ID}/{PORTFOLIO_ID}/patches.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(asset_list_epc_client.patches),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=patches_filename
+ )
+
valuation_data = [
{
- "uprn": 100050770761,
- "value": 67_000
- }
+ "valuation": 339_000,
+ "uprn": 200003423454,
+ },
+ {
+ "valuation": 374_000,
+ "uprn": 200003423194
+ },
+ {
+ "valuation": 719_000,
+ "uprn": 200003423607
+ },
]
# Store valuation data to s3
valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv"
@@ -68,7 +101,7 @@ def app():
"goal_value": "C",
"trigger_file_path": filename,
"already_installed_file_path": "",
- "patches_file_path": "",
+ "patches_file_path": patches_filename,
"non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
"valuation_file_path": valuation_filename,
"scenario_name": "Full package remote assessment",
diff --git a/etl/customers/settle/route_march_2024_11_08.py b/etl/customers/settle/route_march_2024_11_08.py
new file mode 100644
index 00000000..21b6f2df
--- /dev/null
+++ b/etl/customers/settle/route_march_2024_11_08.py
@@ -0,0 +1,226 @@
+import os
+import time
+
+import pandas as pd
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from utils.s3 import read_excel_from_s3
+from backend.SearchEpc import SearchEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list):
+    """Fetch the newest EPC record (and its recommendations) for each home.
+
+    :param asset_list: DataFrame with row_id, Postcode and AddressLine1/4/5 columns
+    :return: (epc_data, errors) - EPC dicts, and the row_ids whose lookup raised
+    """
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            address1 = home["AddressLine1"]
+            searcher = SearchEpc(
+                address1=str(address1),
+                postcode=home["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=", ".join([address1, home["AddressLine4"], home["AddressLine5"]]),
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Recommendations are best-effort: keep the EPC even if this lookup fails
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except Exception:
+                property_recommendations = {"rows": []}
+
+            epc_data.append({
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"]
+            })
+        except Exception:
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def app():
+    """
+    This app pulls EPC data for the properties in Settle's full proposed programme
+
+    Data request contents:
+        Date of last EPC
+        Reason for EPC
+        SAP score on register
+        Property Type
+        Property Area
+        Property Age
+        Any Dimensions (HLP,PW,RH)
+        Property Wall Construction
+        Heating Type
+        Secondary Heating
+        Loft Insulation Depth
+
+    Additional if possible:
+        Heat loss calculations
+        EPC recommendations
+        Property UPRN
+
+    """
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/SETTLE FULL PROPOSED PROGRAMME.xlsx",
+        header=0
+    )
+    asset_list["row_id"] = asset_list.index
+
+    epc_data, errors = get_data(asset_list)
+
+    # We now retrieve any failed properties (a single retry; second-pass failures are dropped)
+    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+    epc_data_failed, _ = get_data(asset_list_failed)
+
+    # Append the failed data to the main data
+    epc_data.extend(epc_data_failed)
+
+    epc_df = pd.DataFrame(epc_data)
+
+    # We expand out the recommendations into one True/False column per recommendation text
+    recommendations_df = epc_df[["row_id", "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    columns = ["row_id"] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data["row_id"] = row["row_id"]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # Drop the "" column (blank recommendation text); it only exists if such a row was returned
+    transformed_df = transformed_df.drop(columns=[""], errors="ignore")
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [
+            "row_id",
+            "uprn",
+            "property-type",
+            "built-form",
+            "inspection-date",
+            "current-energy-rating",
+            "current-energy-efficiency",
+            "roof-description",
+            "walls-description",
+            "transaction-type",
+            # New fields needed
+            "secondheat-description",
+            "total-floor-area",
+            "construction-age-band",
+            "floor-height",
+            "number-habitable-rooms",
+            "mainheat-description",
+            #
+            "energy-consumption-current",  # kwh/m2
+        ]
+    ]
+
+    asset_list = asset_list.merge(
+        epc_df,
+        how="left",
+        on="row_id"
+    ).merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+
+    asset_list = asset_list.drop(columns=["row_id"])
+
+    # Rename the columns
+    asset_list = asset_list.rename(columns={
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)"
+    })
+
+    asset_list["Estimated Number of Floors"] = asset_list.apply(
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+            x["Property Type"]) else None, axis=1
+    )
+
+    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    # Blank strings become NaN; an explicit NaN avoids the replace(value=None) quirks of older pandas
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", float("nan"))
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_perimeter(
+            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+        ), axis=1
+    )
+
+    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
+        lambda x: estimate_external_wall_area(
+            num_floors=x["Estimated Number of Floors"],
+            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+            perimeter=x["Estimated Perimeter (m)"],
+            built_form=x["Archetype"]
+        ),
+        axis=1
+    )
+
+    asset_list["Roof Insulation Thickness"] = asset_list.apply(
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+            x["Roof Construction"]) else None,
+        axis=1
+    )
+
+    # Store as an excel
+    filename = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Settle/Settle EPC Data pull - 08 Nov 2024.xlsx"
+    asset_list.to_excel(filename, index=False)
diff --git a/etl/customers/southend/epc_data_pull_2024_11_14.py b/etl/customers/southend/epc_data_pull_2024_11_14.py
new file mode 100644
index 00000000..11ddcc6f
--- /dev/null
+++ b/etl/customers/southend/epc_data_pull_2024_11_14.py
@@ -0,0 +1,231 @@
+import os
+import time
+
+import pandas as pd
+from tqdm import tqdm
+
+from dotenv import load_dotenv
+from utils.s3 import read_excel_from_s3
+from backend.SearchEpc import SearchEpc
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list):
+    """Fetch the newest EPC record (and its recommendations) for each home.
+
+    :param asset_list: DataFrame with row_id, Postcode, Address and address1 columns
+    :return: (epc_data, errors) - EPC dicts, and the row_ids whose lookup raised
+    """
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            searcher = SearchEpc(
+                address1=str(home["address1"].split(",")[0]),
+                postcode=home["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=home["Address"],
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Recommendations are best-effort: keep the EPC even if this lookup fails
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except Exception:
+                property_recommendations = {"rows": []}
+
+            epc_data.append({
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"]
+            })
+        except Exception:
+            # Record the failing row and pause before moving on to the next home
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def app():
+    """
+    This app pulls EPC data for the properties in Southend's planned programme
+
+    Data request contents:
+        Date of last EPC
+        Reason for EPC
+        SAP score on register
+        Property Type
+        Property Area
+        Property Age
+        Any Dimensions (HLP,PW,RH)
+        Property Wall Construction
+        Heating Type
+        Secondary Heating
+        Loft Insulation Depth
+
+    Additional if possible:
+        Heat loss calculations
+        EPC recommendations
+        Property UPRN
+
+    """
+    asset_list = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/Southend Planned programme.xlsx",
+        header=0,
+        sheet_name="Planned RM"
+    )
+    asset_list["row_id"] = asset_list.index
+    asset_list["address1"] = asset_list["Address"].str.split(",").str[0]
+
+    epc_data, errors = get_data(asset_list)
+
+    # We now retrieve any failed properties (a single retry; second-pass failures are dropped)
+    asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
+    epc_data_failed, _ = get_data(asset_list_failed)
+
+    # Append the failed data to the main data
+    epc_data.extend(epc_data_failed)
+
+    epc_df = pd.DataFrame(epc_data)
+
+    # We expand out the recommendations into one True/False column per recommendation text
+    recommendations_df = epc_df[["row_id", "recommendations"]]
+
+    unique_recommendations = set()
+    for _, row in recommendations_df.iterrows():
+        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
+
+    columns = ["row_id"] + list(unique_recommendations)
+    transformed_data = []
+    for _, row in recommendations_df.iterrows():
+        # Initialize a dictionary for this row with False for all recommendations
+        row_data = {col: False for col in columns}
+        row_data["row_id"] = row["row_id"]
+
+        # Set True for each recommendation present in this row
+        for rec in row["recommendations"]:
+            recommendation_text = rec["improvement-summary-text"]
+            row_data[recommendation_text] = True
+
+        # Append the row data to transformed_data
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(transformed_data)
+    # Drop the "" column (blank recommendation text); it only exists if such a row was returned
+    transformed_df = transformed_df.drop(columns=[""], errors="ignore")
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [
+            "row_id",
+            "uprn",
+            "property-type",
+            "built-form",
+            "inspection-date",
+            "current-energy-rating",
+            "current-energy-efficiency",
+            "roof-description",
+            "walls-description",
+            "transaction-type",
+            # New fields needed
+            "secondheat-description",
+            "total-floor-area",
+            "construction-age-band",
+            "floor-height",
+            "number-habitable-rooms",
+            "mainheat-description",
+            #
+            "energy-consumption-current",  # kwh/m2
+            "photo-supply",
+        ]
+    ]
+
+    asset_list = asset_list.merge(
+        epc_df,
+        how="left",
+        on="row_id"
+    ).merge(
+        transformed_df,
+        how="left",
+        on="row_id"
+    )
+
+    asset_list = asset_list.drop(columns=["row_id"])
+
+    # Rename the columns
+    asset_list = asset_list.rename(columns={
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)",
+        "photo-supply": "% of the Roof with PV"
+    })
+
+    asset_list["Estimated Number of Floors"] = asset_list.apply(
+        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
+            x["Property Type"]) else None, axis=1
+    )
+
+    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
+    # Blank strings become NaN; an explicit NaN avoids the replace(value=None) quirks of older pandas
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", float("nan"))
+    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
+
+    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
+        lambda x: estimate_perimeter(
+            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
+            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
+        ), axis=1
+    )
+
+    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
+        lambda x: estimate_external_wall_area(
+            num_floors=x["Estimated Number of Floors"],
+            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
+            perimeter=x["Estimated Perimeter (m)"],
+            built_form=x["Archetype"]
+        ),
+        axis=1
+    )
+
+    asset_list["Roof Insulation Thickness"] = asset_list.apply(
+        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
+            x["Roof Construction"]) else None,
+        axis=1
+    )
+
+    # Store as an excel
+    filename = ("/Users/khalimconn-kowlessar/Documents/hestia/Customers/southend/southend EPC Data pull - 14 Nov "
+                "2024.xlsx")
+    asset_list.to_excel(filename, index=False)
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
new file mode 100644
index 00000000..95fe4fcd
--- /dev/null
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -0,0 +1,4293 @@
+import os
+from urllib import parse
+from fuzzywuzzy import fuzz
+
+import PyPDF2
+import re
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from collections import Counter
+from scipy.optimize import linprog
+
+from SearchEpc import SearchEpc
+from utils.s3 import read_pickle_from_s3
+
+CUSTOMER_FOLDER_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
+SURVEY_FOLDERS = os.path.join(CUSTOMER_FOLDER_PATH, "StonewaterSurveys_{i}")
+NUM_FOLDERS = 15
+
+
+def sap_to_epc(sap_points: int | float):
+    """
+    Map a SAP score onto its EPC letter band.
+
+    Band floors: A >= 92, B >= 81, C >= 69, D >= 55, E >= 39, F >= 21; any
+    lower positive score is band G.
+
+    :param sap_points: numerical value of SAP points, typically between 0 and 100
+    :return: the EPC band as a single letter from "A" to "G"
+    :raises ValueError: if sap_points is zero or negative
+    """
+    if sap_points <= 0:
+        raise ValueError("SAP points should be above 0.")
+
+    # Lowest qualifying score per band, best band first.
+    band_floors = (
+        (92, "A"), (81, "B"), (69, "C"),
+        (55, "D"), (39, "E"), (21, "F"),
+    )
+    for floor, band in band_floors:
+        if sap_points >= floor:
+            return band
+    # Anything under the F floor is band G.
+    return "G"
+
+
+def extract_wall_details_summary(text):
+    """
+    Extracts wall type, insulation, dry-lining, and thickness for each building part,
+    including any alternative wall details within the 7.0 Walls section of the summary PDF text.
+
+    An alternative wall is attached only to the building part whose entry it follows.
+
+    :param text: full text of the summary report PDF
+    :return: list with one dict of wall details per building part, in document order
+    :raises ValueError: if the 7.0 Walls section cannot be located
+    """
+    # Locate the entire 7.0 Walls section
+    section_match = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL)
+    if section_match is None:
+        raise ValueError("Failed to locate walls section in the text.")
+    wall_section = section_match.group(1)
+
+    # Define pattern to match each building part's wall entry within the section
+    building_part_pattern = re.compile(
+        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
+        r"Type\s+(.*?)\n"  # Matches main wall Type
+        r"Insulation\s+(.*?)\n"  # Matches main wall Insulation
+        r"(Dry-lining\s+(.*?)\n)?"  # Optional main wall Dry-lining
+        r"Wall Thickness Unknown\s+(.*?)\n"  # Matches main wall Thickness Unknown
+        r"Wall Thickness \[mm\]\s+(\d+)",  # Matches main wall Thickness
+        re.DOTALL
+    )
+
+    # Define pattern to capture alternative wall details, if present
+    alternative_wall_pattern = re.compile(
+        r"Alternative Wall Area.*?\n"  # Matches start of alternative wall section
+        r"Alternative Type\s+(.*?)\n"  # Matches alternative wall Type
+        r"Alternative Insulation\s+(.*?)\n"  # Matches alternative wall Insulation
+        r"(Alternative Dry-lining\s+(.*?)\n)?"  # Optional Alternative Dry-lining
+        r"Alternative Wall Thickness Unknown\s+(.*?)\n"  # Matches alternative wall Thickness Unknown
+        r"Alternative Wall Thickness\s+(\d+)",  # Matches alternative wall Thickness
+        re.DOTALL
+    )
+
+    wall_data = []
+    part_matches = list(building_part_pattern.finditer(wall_section))
+    # A part's alternative wall must appear before the next part's "Type" value; bounding the
+    # search there stops a later part's alternative wall being attached to an earlier part
+    search_ends = [m.start(2) for m in part_matches[1:]] + [len(wall_section)]
+    for match, search_end in zip(part_matches, search_ends):
+        wall_entry = {
+            "Building Part": match.group(1).strip(),
+            "Wall Type": match.group(2).strip(),
+            "Wall Insulation": match.group(3).strip(),
+            "Wall Dry-lining": match.group(5).strip() if match.group(5) else "N/A",
+            "Wall Thickness Unknown": match.group(6).strip(),
+            "Wall Thickness (mm)": int(match.group(7)),
+            "Alternative Wall Type": None,
+            "Alternative Wall Insulation": None,
+            "Alternative Wall Dry-lining": "N/A",
+            "Alternative Wall Thickness Unknown": None,
+            "Alternative Wall Thickness (mm)": None,
+        }
+
+        alt_match = alternative_wall_pattern.search(wall_section, match.end(), search_end)
+        if alt_match:
+            wall_entry["Alternative Wall Type"] = alt_match.group(1).strip()
+            wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip()
+            wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A"
+            wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip()
+            wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6))
+
+        wall_data.append(wall_entry)
+
+    return wall_data
+
+
+def extract_summary_report(pdf_path):
+ """
+ Extracts summary-report fields from the PDF at pdf_path and returns them as a dict.
+ Data includes:
+ - Current SAP rating, age band, storeys, fuel bill, address, windows, doors, heating, lighting
+ - Building dimensions plus main roof and main wall details (including any alternative wall)
+ - NOTE: most lookups call .group() on the regex match unguarded, so a missing field raises AttributeError
+ """
+
+ data = {
+ "Address": None,
+ "Postcode": None,
+ "Current SAP Rating": None,
+ "Current EPC Band": None,
+ "Fuel Bill": None,
+ "Main Building Age Band": None,
+ "Number of Storeys": None,
+ "Window Age Description": None,
+ "Window Age Description Proportion (%)": None,
+ "Secondary Window Age Description": None,
+ "Secondary Window Age Description Proportion (%)": None,
+ "Number of Windows": None,
+ "Total Number of Doors": None,
+ "Number of Insulated Doors": None,
+ "Existing Primary Heating System": None,
+ "Existing Primary Heating PCDF Reference": None,
+ "Existing Primary Heating Controls": None,
+ "Existing Primary Heating % of Heat": None,
+ "Existing Secondary Heating System": None,
+ "Existing Secondary Heating PCDF Reference": None,
+ "Existing Secondary Heating Controls": None,
+ "Existing Secondary Heating % of Heat": None,
+ "Secondary Heating Code": None,
+ "Water Heating Code": None,
+ 'Total Floor Area (m2)': None,
+ 'Total Ground Floor Area (m2)': None,
+ 'RIR Floor Area': None,
+ 'Main Building Wall Area (m2)': None,
+ 'First Extension Wall Area (m2)': None,
+ "Number of Light Fittings": None,
+ "Number of LEL Fittings": None,
+ "Number of fittings needing LEL": None,
+ "Main Roof Type": None,
+ "Main Roof Insulation": None,
+ "Main Roof Insulation Thickness": None,
+ "Main Wall Type": None,
+ "Main Wall Insulation": None,
+ "Main Wall Dry-lining": None,
+ "Main Wall Thickness": None,
+ "Main Building Alternative Wall Type": None,
+ "Main Building Alternative Wall Insulation": None,
+ "Main Building Alternative Wall Dry-lining": None,
+ "Main Building Alternative Wall Thickness": None,
+ }
+
+ with (open(pdf_path, "rb") as file):
+ reader = PyPDF2.PdfReader(file)
+ text = ""
+ for page in reader.pages:
+ text += page.extract_text()
+
+ # Extract Current SAP rating
+ sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
+ data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]  # digits after the band letter, kept as a string
+
+ # Extract age
+ age_band_match = re.search(
+ r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
+ text
+ )
+ data["Main Building Age Band"] = age_band_match.group(1)
+
+ # Number of storeys
+ storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
+ data["Number of Storeys"] = int(storeys_match.group(1))
+
+ # Extract Carbon Emissions
+ # carbon_match = re.search(r"Emissions \(t/year\):\s*([\d.]+)\s*tonnes", text)
+ # data["Carbon Emissions (t/year)"] = float(carbon_match.group(1))
+
+ # Extract Fuel Bill
+ fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+)", text)
+ data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+ # Extract individual address components
+ postcode = re.search(r"Postcode:\s*(.*?)\nRegion:", text)
+ # region = re.search(r"Region:\s*(.*?)\nHouse Name:", text)
+ house_name = re.search(r"House Name:\s*(.*?)\nHouse No:", text)
+ house_no = re.search(r"House No:\s*(.*?)\nStreet:", text)
+ street = re.search(r"Street:\s*(.*?)\nLocality:", text)
+ locality = re.search(r"Locality:\s*(.*?)\nTown:", text)
+ town = re.search(r"Town:\s*(.*?)\nCounty:", text)
+ county = re.search(r"County:\s*(.*?)\nProperty Tenure:", text)
+
+ # Strip each extracted value; a component whose pattern did not match becomes ""
+ address_parts = [
+ house_no.group(1).strip() if house_no else "",
+ house_name.group(1).strip() if house_name else "",
+ street.group(1).strip() if street else "",
+ locality.group(1).strip() if locality else "",
+ town.group(1).strip() if town else "",
+ county.group(1).strip() if county else "",
+ postcode.group(1).strip() if postcode else ""
+ ]
+
+ # Join non-empty parts with a comma
+ data["Address"] = ", ".join([part for part in address_parts if part])
+ data["Postcode"] = postcode.group(1).strip()  # unlike the address parts above, not guarded against a missing match
+
+ windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+ windows_text = windows_section.group(1)
+ window_data = extract_window_age_description(windows_text)
+ data.update(window_data)
+
+ # Extract Total Number of Doors
+ total_doors_match = re.search(r"Total Number of Doors\s*(\d+)", text)
+ data["Total Number of Doors"] = int(total_doors_match.group(1))
+
+ # Extract Number of Insulated Doors
+ insulated_doors_match = re.search(r"Number of Insulated Doors\s*(\d+)", text)
+ data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
+
+ # Extract heating system
+ # The Main Heating1 block runs up to Main Heating2 when a second system is listed,
+ # otherwise up to the Water Heating section
+ primary_heating_section1 = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
+ primary_heating_section2 = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+ primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
+
+ primary_text = primary_heating_section.group(1)
+
+ data["Existing Primary Heating System"] = re.search(r"Main Heating Code\s*(.*?)\n", primary_text).group(
+ 1).strip()
+ data["Existing Primary Heating PCDF Reference"] = re.search(
+ r"PCDF boiler Reference\s*(\d+)", primary_text
+ ).group(1)
+ data["Existing Primary Heating Controls"] = re.search(
+ r"Main Heating Controls\s*(.*?)\n", primary_text
+ ).group(1).strip()
+ data["Existing Primary Heating % of Heat"] = int(
+ re.search(r"Percentage of Heat\s*(\d+)\s*%", primary_text).group(1)
+ )
+
+ # Extract Secondary Heating Section
+ secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
+
+ if secondary_heating_section is None:
+ data["Existing Secondary Heating System"] = ""
+ data["Existing Secondary Heating PCDF Reference"] = ""
+ data["Existing Secondary Heating Controls"] = ""
+ data["Existing Secondary Heating % of Heat"] = 0
+
+ else:
+ secondary_text = secondary_heating_section.group(1)
+
+ main_heating_code_match_secondary = re.search(
+ r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
+ )
+ data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()
+ data["Existing Secondary Heating PCDF Reference"] = re.search(r"PCDF boiler Reference\s*(\d+)",
+ secondary_text).group(1)
+ second_heating_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
+ data["Existing Secondary Heating Controls"] = (
+ second_heating_controls_match.group(1).strip() if second_heating_controls_match else ""
+ )
+ data["Existing Secondary Heating % of Heat"] = int(
+ re.search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text).group(1)
+ )
+
+ # Extract Secondary Heating and Water Heating Codes
+ secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
+ water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
+
+ if data["Existing Secondary Heating System"] == "":
+ data["Secondary Heating Code"] = ""
+ else:
+ data["Secondary Heating Code"] = secondary_heating_code_match.group(
+ 1).strip() if secondary_heating_code_match else ""
+
+ data["Water Heating Code"] = water_heating_code_match.group(1).strip()
+
+ dimensions = extract_building_parts_summary(text)
+ data.update(dimensions)
+
+ data["Number of Light Fittings"] = int(re.search(r"Total number of light fittings\s*(\d+)", text).group(1))
+ data["Number of LEL Fittings"] = int(re.search(r"Total number of L.E.L. fittings\s*(\d+)", text).group(1))
+ data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
+
+ extracted_roof_data = extract_roof_details_summary(text)
+ main_roof_data = [roof for roof in extracted_roof_data if "Main" in roof["Building Part"]][0]  # IndexError if no "Main" roof
+ data["Main Roof Type"] = main_roof_data["Roof Type"]
+ data["Main Roof Insulation"] = main_roof_data["Roof Insulation"]
+ data["Main Roof Insulation Thickness"] = main_roof_data["Roof Insulation Thickness"]
+
+ walls_data = extract_wall_details_summary(text)
+ # Get the main building wall data (first entry whose building part mentions "Main")
+ main_building_walls = [wall for wall in walls_data if "Main" in wall["Building Part"]][0]
+ data["Main Wall Type"] = main_building_walls["Wall Type"]
+ data["Main Wall Insulation"] = main_building_walls["Wall Insulation"]
+ data["Main Wall Dry-lining"] = main_building_walls["Wall Dry-lining"]
+ data["Main Wall Thickness"] = main_building_walls["Wall Thickness (mm)"]
+ data["Main Building Alternative Wall Type"] = main_building_walls["Alternative Wall Type"]
+ data["Main Building Alternative Wall Insulation"] = main_building_walls["Alternative Wall Insulation"]
+ data["Main Building Alternative Wall Dry-lining"] = main_building_walls["Alternative Wall Dry-lining"]
+ data["Main Building Alternative Wall Thickness"] = main_building_walls["Alternative Wall Thickness (mm)"]
+
+ return data
+
+
+def extract_window_age_description(windows_text):
+    """
+    Summarises the glazing descriptions found in the windows section of a report.
+
+    Parameters:
+    windows_text (str): The text section containing window data.
+
+    Returns:
+    dict: The most common glazing description, the runner-up (if any), the share of
+    windows each accounts for (in percent) and the total number of windows found.
+
+    Raises:
+    ValueError: If none of the known glazing descriptions occur in the text.
+    """
+    # Glazing descriptions recognised in the report; ties are broken in this order
+    known_labels = (
+        "Double post or during 2002",
+        "Double pre 2002",
+        "Double with unknown install date",
+        "Secondary glazing",
+        "Triple glazing",
+        "Single glazing",
+    )
+
+    # Remove line breaks so descriptions wrapped across lines still match
+    flattened = windows_text.replace("\n", "")
+    tally = Counter({
+        label: len(re.findall(re.escape(label), flattened)) for label in known_labels
+    })
+    total_windows = sum(tally.values())
+    if not tally or not total_windows:
+        raise ValueError("Failed to extract window data.")
+
+    # most_common() ranks the descriptions by count, highest first
+    ranked = tally.most_common(2)
+    top_label, top_count = ranked[0]
+    top_share = top_count / total_windows * 100
+
+    # A runner-up is only reported when the top description does not cover every window
+    runner_up_label, runner_up_share = None, 0
+    if top_share != 100:
+        runner_up_label, runner_up_count = ranked[1]
+        runner_up_share = runner_up_count / total_windows * 100
+
+    return {
+        "Window Age Description": top_label,
+        "Window Age Description Proportion (%)": top_share,
+        "Secondary Window Age Description": runner_up_label,
+        "Secondary Window Age Description Proportion (%)": runner_up_share,
+        "Number of Windows": total_windows
+    }
+
+
+def extract_building_parts_epr(text):
+    """
+    Aggregates floor and wall dimensions from the "Construction details" sections of the PDF text.
+
+    Each "Building part" section yields one record per floor row (floor area, room height,
+    perimeter and party wall length). When the part heading itself carries a
+    'Room(s) in Roof area' figure, a single room-in-roof record holding only that floor
+    area is stored for the section instead.
+
+    Floor rows are recognised for the 'Lowest floor', 'First floor' and 'Second floor' levels.
+
+    :param text: text extracted from the PDF
+    :return: dict with the total, ground-floor and room-in-roof floor areas plus the main
+        building and first extension wall areas (perimeter x room height per floor)
+    """
+    # Pattern to locate each "Building part" section
+    section_pattern = re.compile(
+        r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
+        r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
+        re.DOTALL
+    )
+
+    # Pattern to match one floor's measurements in standard cases
+    floor_row_pattern = re.compile(
+        r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+    )
+
+    # One record per floor row (or per room in roof) across all building parts
+    records = []
+    for section in section_pattern.finditer(text):
+        heading = section.group(1).strip()
+
+        # Check for "Room(s) in Roof area" within the part heading
+        rir_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", heading)
+        if rir_match:
+            # Only the floor area is known for a room in roof; the rest stays None
+            rir_floor_area = float(rir_match.group(1))
+            records.append({
+                "Building Part": re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", heading).strip(),
+                "Floor Level": "Room in Roof",
+                "Floor Area (m2)": rir_floor_area,
+                "Room Height (m)": None,
+                "Perimeter (m)": None,
+                "Party Wall Length (m)": None
+            })
+            continue
+
+        # Keep only the descriptor (e.g., "Main" or "1st Extension")
+        part_label = re.sub(r" - built in.*", "", heading).strip()
+        # Extract floor details for each building part
+        for row in floor_row_pattern.finditer(section.group(2)):
+            level, area, height, perimeter, party_wall = row.groups()
+            records.append({
+                "Building Part": part_label,
+                "Floor Level": level,
+                "Floor Area (m2)": float(area),
+                "Room Height (m)": float(height),
+                "Perimeter (m)": float(perimeter),
+                "Party Wall Length (m)": float(party_wall)
+            })
+
+    def wall_area(part_records):
+        # Perimeter x room height, skipping records where either value is missing or zero
+        return sum(
+            rec["Perimeter (m)"] * rec["Room Height (m)"]
+            for rec in part_records
+            if rec["Perimeter (m)"] and rec["Room Height (m)"]
+        )
+
+    # Aggregated data calculation
+    main_building = [rec for rec in records if "Main" in rec["Building Part"]]
+    first_extension = [rec for rec in records if "1st Extension" in rec["Building Part"]]
+
+    return {
+        "Total Floor Area (m2)": sum(rec["Floor Area (m2)"] for rec in records),
+        "Total Ground Floor Area (m2)": sum(
+            rec["Floor Area (m2)"] for rec in records if "Lowest floor" in rec["Floor Level"]
+        ),
+        "RIR Floor Area": sum(
+            rec["Floor Area (m2)"] for rec in records if "Room in Roof" in rec["Floor Level"]
+        ),
+        "Main Building Wall Area (m2)": wall_area(main_building),
+        "First Extension Wall Area (m2)": wall_area(first_extension) if first_extension else 0,
+    }
+
+
def extract_building_parts_summary(text):
    """
    Parse the "Dimensions" block of a summary report and aggregate its measurements.

    The block is located between a "Dimensions: Dimension type: Internal" (or, failing
    that, "External") header and the "5.0 Conservatory:" heading. Inside it, every
    "Main Property" / "<n>th Extension" part is scanned for per-floor measurements and
    for a "Room(s) in Roof" floor area.

    :param text: Full text extracted from the summary report PDF
    :return: Dict with the total floor area, total ground ("Lowest Floor") area,
             room-in-roof area, and the wall area (perimeter * room height) of the
             main property and of the first extension
    :raises ValueError: If neither an Internal nor an External dimensions section is found
    """
    # Internal dimensions are preferred; External is only consulted as a fallback.
    section_patterns = (
        r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:",
        r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:",
    )
    section = None
    for section_pattern in section_patterns:
        section = re.search(section_pattern, text, re.DOTALL)
        if section:
            break
    if section is None:
        raise ValueError("Failed to locate dimensions section in the text.")

    section_body = section.group(1)

    # One match per building part: the label, then everything up to the next part label
    part_re = re.compile(
        r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
        r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
        re.DOTALL
    )
    # Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length
    floor_re = re.compile(
        r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
    )
    # Room-in-roof entries only carry a floor area
    rir_re = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")

    rows = []
    for part in part_re.finditer(section_body):
        label, measurements = part.group(1), part.group(2)

        for level, area, height, perimeter, party_wall in floor_re.findall(measurements):
            rows.append({
                "Building Part": label,
                "Floor Level": level,
                "Floor Area (m2)": float(area),
                "Room Height (m)": float(height),
                "Perimeter (m)": float(perimeter),
                "Party Wall Length (m)": float(party_wall)
            })

        rir = rir_re.search(measurements)
        if rir:
            rows.append({
                "Building Part": label,
                "Floor Level": "Room in Roof",
                "Floor Area (m2)": float(rir.group(1)),
                "Room Height (m)": None,  # Not reported for room in roof
                "Perimeter (m)": None,  # Not reported for room in roof
                "Party Wall Length (m)": None  # Not reported for room in roof
            })

    def floor_area_where(level_fragment):
        # Sum the floor area of every row whose floor level contains the fragment
        return sum(
            [row["Floor Area (m2)"] for row in rows if level_fragment in row["Floor Level"]]
        )

    def wall_area_for(part_fragment):
        # Perimeter * height, skipping rows where either measurement is missing or zero
        return sum(
            [
                row["Perimeter (m)"] * row["Room Height (m)"]
                for row in rows
                if part_fragment in row["Building Part"]
                and row["Perimeter (m)"] and row["Room Height (m)"]
            ]
        )

    return {
        "Total Floor Area (m2)": sum([row["Floor Area (m2)"] for row in rows]),
        "Total Ground Floor Area (m2)": floor_area_where("Lowest Floor"),
        "RIR Floor Area": floor_area_where("Room in Roof"),
        "Main Building Wall Area (m2)": wall_area_for("Main Property"),
        "First Extension Wall Area (m2)": wall_area_for("1st Extension"),
    }
+
+
def extract_roof_details_epr(text):
    """
    Collect the roof description of every building part listed in an EPR.

    Each "Construction details: Building part: ..." section yields one dict holding the
    cleaned part name plus the roof type, roof insulation and roof insulation thickness
    lines found in that section (None when a line is absent).

    :param text: Full text extracted from the EPR PDF
    :return: List of dicts, one per building part, in document order
    """
    # A section runs from its header line to the next section, a Conservatory heading, or the end
    section_re = re.compile(
        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
        re.DOTALL
    )

    # Output key -> pattern capturing the rest of the matching line
    field_patterns = (
        ("Roof Type", r"Roof Type:\s*(.*?)(?=\n|$)"),
        ("Roof Insulation", r"Roof Insulation:\s*(.*?)(?=\n|$)"),
        ("Roof Insulation Thickness", r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)"),
    )

    results = []
    for section in section_re.finditer(text):
        raw_name = section.group(1).strip()
        details = section.group(2)

        # Drop the age band / room-in-roof suffix so only the part descriptor remains
        entry = {
            "Building Part": re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", raw_name).strip()
        }
        for key, pattern in field_patterns:
            found = re.search(pattern, details)
            entry[key] = found.group(1).strip() if found else None

        results.append(entry)

    return results
+
+
def extract_roof_details_summary(text):
    """
    Read the roof entries from the "8.0 Roofs:" section of a summary report.

    :param text: Full text extracted from the summary report PDF
    :return: List of dicts (building part, roof type, insulation, insulation thickness);
             empty when the report has no "8.0 Roofs:" section
    """
    section = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
    if section is None:
        return []

    # Re-append the next heading so the entry pattern always has a terminating boundary
    bounded_section = section.group(1).strip() + "\n9.0 Floors:"

    entry_re = re.compile(
        r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
        r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))"  # Matches Roof Type until the next field, label, or end
        r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?"  # Optional Insulation
        r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?",  # Optional Insulation Thickness
        re.DOTALL
    )

    results = []
    for entry in entry_re.finditer(bounded_section):
        label, roof_type, insulation, thickness = entry.groups()
        roof_type = roof_type.strip()

        # The type capture can swallow the following part label, e.g.
        # 'A Another dwelling above\n1st Extension' - keep only the type itself
        if roof_type.startswith("A Another dwelling above"):
            roof_type = "A Another dwelling above"

        results.append({
            "Building Part": label.strip(),
            "Roof Type": roof_type,
            "Roof Insulation": insulation.strip() if insulation else None,
            "Roof Insulation Thickness": thickness.strip() if thickness else None,
        })

    return results
+
+
def extract_wall_details_epr(text):
    """
    Collect the wall description of every building part listed in an EPR.

    For each "Construction details: Building part: ..." section this returns the wall
    type, insulation, dry-lining and thickness, together with the same four fields for
    the alternative wall. Text fields are stripped strings, thicknesses are ints, and
    any field that is not present in the section is None.

    :param text: Full text extracted from the EPR PDF
    :return: List of dicts, one per building part, in document order
    """
    # A section runs from its header line to the next section, a Conservatory heading, or the end
    section_re = re.compile(
        r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
        re.DOTALL
    )

    # (output key, pattern, converter applied to the captured text)
    field_specs = (
        ("Wall Type", r"Wall Type:\s*(.*?)(?=\n|$)", str.strip),
        ("Wall Insulation", r"Wall Insulation:\s*(.*?)(?=\n|$)", str.strip),
        ("Wall Dry-lining", r"Wall Dry-lining:\s*(.*?)(?=\n|$)", str.strip),
        ("Wall Thickness", r"Wall Thickness:\s*(\d+)(?=\n|$)", int),
        ("Alternative Wall Type", r"Alternative Wall Type:\s*(.*?)(?=\n|$)", str.strip),
        ("Alternative Wall Insulation", r"Alternative Wall Insulation:\s*(.*?)(?=\n|$)", str.strip),
        ("Alternative Wall Dry-lining", r"Alternative Wall Dry-lining:\s*(.*?)(?=\n|$)", str.strip),
        ("Alternative Wall Thickness", r"Alternative Wall Thickness:\s*(\d+)(?=\n|$)", int),
    )

    results = []
    for section in section_re.finditer(text):
        raw_name = section.group(1).strip()
        details = section.group(2)

        # Drop the age band / room-in-roof suffix so only the part descriptor remains
        entry = {
            "Building Part": re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", raw_name).strip()
        }
        for key, pattern, convert in field_specs:
            found = re.search(pattern, details)
            entry[key] = convert(found.group(1)) if found else None

        results.append(entry)

    return results
+
+
def extract_epr(pdf_path):
    """
    Extracts specific data from an Energy Report (EPR) PDF file.

    The text of every page is concatenated and then mined with regular expressions.
    Both the current report layout and the older Elmhurst layout are handled for the
    SAP rating and the primary energy figures.

    Most fields are read without a guard: if the expected label is absent from the
    PDF text, the corresponding ``re.search`` returns None and the following
    ``.group(...)`` call raises AttributeError. Likewise an IndexError is raised when
    no "Main" building part is found among the roof or wall details.

    :param pdf_path: String path to the EPR PDF file
    :return: Dict keyed by the column names initialised below. Fields that the report
             does not provide keep their None default (or "" / 0 for the second
             heating system when it is absent).
    """

    data = {
        "Address": None,
        "Postcode": None,
        "Current SAP Rating": None,
        "Current EPC Band": None,
        "Primary Energy Use (kWh/yr)": None,
        "Primary Energy Use Intensity (kWh/m2/yr)": None,
        "Number of Storeys": None,
        "Main Building Age Band": None,
        "Fuel Bill": None,
        "Window Age Description": None,
        "Window Age Description Proportion (%)": None,
        "Secondary Window Age Description": None,
        "Secondary Window Age Description Proportion (%)": None,
        "Number of Windows": None,
        "Total Number of Doors": None,
        "Number of Insulated Doors": None,
        "Existing Primary Heating System": None,
        "Existing Primary Heating PCDF Reference": None,
        "Existing Primary Heating Controls": None,
        "Existing Primary Heating % of Heat": None,
        "Existing Secondary Heating System": None,
        "Existing Secondary Heating PCDF Reference": None,
        "Existing Secondary Heating Controls": None,
        "Existing Secondary Heating % of Heat": None,
        "Secondary Heating Code": None,
        "Water Heating Code": None,
        'Total Floor Area (m2)': None,
        'Total Ground Floor Area (m2)': None,
        'RIR Floor Area': None,
        'Main Building Wall Area (m2)': None,
        'First Extension Wall Area (m2)': None,
        "Number of Light Fittings": None,
        "Number of LEL Fittings": None,
        "Number of fittings needing LEL": None,
        "Main Roof Type": None,
        "Main Roof Insulation": None,
        "Main Roof Insulation Thickness": None,
        "Main Wall Type": None,
        "Main Wall Insulation": None,
        "Main Wall Dry-lining": None,
        "Main Wall Thickness": None,
        "Main Building Alternative Wall Type": None,
        "Main Building Alternative Wall Insulation": None,
        "Main Building Alternative Wall Dry-lining": None,
        "Main Building Alternative Wall Thickness": None,
        "Main Fuel": None
    }

    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = "".join(page.extract_text() for page in reader.pages)

    # Extract Address; the postcode is taken to be the last comma-separated component
    address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
    data["Address"] = address_match.group(1).strip()
    data["Postcode"] = data["Address"].split(",")[-1].strip()

    # Extract Current and Potential SAP ratings
    sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
    if sap_match is None:
        # Handles the older format of the elmhurst EPR
        # The text will look something like this:
        # Least energy efficient - higher running costsD 61 - we extract D 61
        # The groups must be named: they are read back by name just below.
        sap_match = re.search(
            r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})"
            r"(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
            text)
        data["Current EPC Band"] = sap_match.group("current_epc")
        data["Current SAP Rating"] = int(sap_match.group("current_sap"))
    else:
        # Only the current rating (first number) is kept; the potential rating is ignored
        data["Current SAP Rating"] = int(sap_match.group(1))

    # Extract the primary energy use intensity
    additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
    if additional_rating_match:
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
    else:
        # Handles the older format of the Elmhurst EPR
        primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
        data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
        # We calculate the primary energy use intensity by dividing by floor area
        floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)

    # Extract age band
    age_band_match = re.search(
        r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
        text
    )

    data["Main Building Age Band"] = age_band_match.group(1)

    # Extract Number of Storeys
    storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
    data["Number of Storeys"] = int(storeys_match.group(1))

    # Extract Fuel Bill
    fuel_bill_match = re.search(r"TOTAL\s*£(\d+)", text)
    data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

    # Extract Total Number of Doors
    total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
    data["Total Number of Doors"] = int(total_doors_match.group(1))

    # Extract Number of Insulated Doors
    insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
    data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

    # Extract Primary Heating Section (Main Heating 1)
    primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
    # We may not have a secondary heating
    primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
    primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
    primary_text = primary_heating_section.group(1)

    data["Existing Primary Heating System"] = re.search(
        r"Main Heating Code\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating PCDF Reference"] = re.search(
        r"PCDF boiler Reference\s*(\d+)", primary_text
    ).group(1)
    data["Existing Primary Heating Controls"] = re.search(
        r"Main Heating Controls\s*(.*?)\n", primary_text
    ).group(1).strip()
    data["Existing Primary Heating % of Heat"] = int(
        re.search(r"Percentage of Heat\s*(\d+)\s*%?", primary_text).group(1)
    )

    # Extract Secondary Heating Section (Main Heating 2)
    secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
    if secondary_heating_section is None:
        data["Existing Secondary Heating System"] = ""
        data["Existing Secondary Heating PCDF Reference"] = ""
        data["Existing Secondary Heating Controls"] = ""
        data["Existing Secondary Heating % of Heat"] = 0

    else:
        secondary_text = secondary_heating_section.group(1)

        main_heating_code_match_secondary = re.search(
            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
        )
        data["Existing Secondary Heating System"] = main_heating_code_match_secondary.group(1).strip()

        data["Existing Secondary Heating PCDF Reference"] = re.search(
            r"PCDF boiler Reference\s*(\d+)", secondary_text
        ).group(1)

        if data["Existing Secondary Heating System"] == "":
            data["Existing Secondary Heating Controls"] = ""
        else:
            # Might not have heating controls on 2nd system
            secondary_controls_match = re.search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
            data["Existing Secondary Heating Controls"] = (
                secondary_controls_match.group(1).strip() if secondary_controls_match else ""
            )
        data["Existing Secondary Heating % of Heat"] = int(
            re.search(r"Percentage of Heat\s*(\d+)\s*%?", secondary_text).group(1)
        )

    # Extract Secondary Heating and Water Heating Codes
    secondary_heating_code_match = re.search(r"Secondary Heating Code\s*(.*?)\n", text)
    water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)

    if data["Existing Secondary Heating System"] == "":
        data["Secondary Heating Code"] = ""
    else:
        data["Secondary Heating Code"] = secondary_heating_code_match.group(
            1).strip() if secondary_heating_code_match else ""
    data["Water Heating Code"] = water_heating_code_match.group(1).strip()

    # Extract Windows information
    windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
    if windows_section:
        windows_text = windows_section.group(1)
        window_data = extract_window_age_description(windows_text)
        data.update(window_data)

    # Aggregated floor and wall areas per building part
    building_parts = extract_building_parts_epr(text)
    data.update(building_parts)

    # Get number of lighting outlets and number of fittings needing LEL
    lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
    data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
    lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text)
    data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
    data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

    roof_details = extract_roof_details_epr(text)
    # Get from the main building
    main_roof_details = [r for r in roof_details if "Main" in r["Building Part"]]
    data["Main Roof Type"] = main_roof_details[0]["Roof Type"]
    data["Main Roof Insulation"] = main_roof_details[0]["Roof Insulation"]
    data["Main Roof Insulation Thickness"] = main_roof_details[0]["Roof Insulation Thickness"]

    wall_details = extract_wall_details_epr(text)
    main_wall_details = [w for w in wall_details if "Main" in w["Building Part"]][0]
    data["Main Wall Type"] = main_wall_details["Wall Type"]
    data["Main Wall Insulation"] = main_wall_details["Wall Insulation"]
    data["Main Wall Dry-lining"] = main_wall_details["Wall Dry-lining"]
    data["Main Wall Thickness"] = main_wall_details["Wall Thickness"]
    data["Main Building Alternative Wall Type"] = main_wall_details["Alternative Wall Type"]
    data["Main Building Alternative Wall Insulation"] = main_wall_details["Alternative Wall Insulation"]
    data["Main Building Alternative Wall Dry-lining"] = main_wall_details["Alternative Wall Dry-lining"]
    data["Main Building Alternative Wall Thickness"] = main_wall_details["Alternative Wall Thickness"]

    return data
+
+
def detect_report_type(pdf_path, pdf_file):
    """
    Classify a PDF by its first page text, page count and filename.

    :param pdf_path: String path to the PDF file
    :param pdf_file: String name of the PDF file
    :return: "epr" for an energy report with more than 3 pages, "short_form_epr" for an
             energy report with 3 pages or fewer, "summary", "condition", or None when
             the PDF matches none of these
    """
    with open(pdf_path, "rb") as handle:
        # Irregular PDFs make PyPDF2 emit warnings such as
        # "Multiple definitions in dictionary at byte 0x1ab for key /Filter".
        # A library like fitz might cope better with those files.
        pdf = PyPDF2.PdfReader(handle)
        opening_text = pdf.pages[0].extract_text() if pdf.pages else ""
        page_count = len(pdf.pages)

    if is_energy_report(opening_text):
        # A full EPR has more than 3 pages; the shortened version isn't massively useful
        return "epr" if page_count > 3 else "short_form_epr"

    if "summary" in pdf_file.lower() or is_summary_report(opening_text):
        return "summary"

    if is_condition_report(opening_text):
        return "condition"

    return None
+
+
def extract_retrofit_pdfs(data_folder_path):
    """
    Extract report data from the PDFs found directly inside a retrofit data folder.

    Files are classified one by one until both an EPR and a summary report have been
    seen. The EPR is used when available, otherwise the summary report.

    :param data_folder_path: String path to the folder holding the PDF files
    :return: The extracted data, or None when the folder holds neither report type
    """
    epr_path = None
    summary_path = None

    for entry in os.listdir(data_folder_path):
        if not entry.endswith(".pdf"):
            continue

        candidate = os.path.join(data_folder_path, entry)
        kind = detect_report_type(candidate, entry)
        if kind == "epr":
            epr_path = candidate
        elif kind == "summary":
            summary_path = candidate

        # No need to classify further files once both report types are known
        if epr_path and summary_path:
            break

    # The EPR takes priority over the summary report
    if epr_path:
        return extract_epr(epr_path)
    if summary_path:
        return extract_summary_report(summary_path)

    return None
+
+
def is_energy_report(text):
    """
    Tells whether the PDF text belongs to an Energy Report.
    Returns True only if the text begins with the upper-case banner 'ENERGY REPORT'.
    """
    banner = "ENERGY REPORT"
    return text.startswith(banner)
+
+
def is_summary_report(text):
    """
    Tells whether the PDF text belongs to a Summary Report.
    Returns True only if the text begins with 'Summary Information'.
    """
    banner = "Summary Information"
    return text.startswith(banner)
+
+
def detect_and_parse_report(pdf_path, pdf_file):
    """
    Identify the report type from the first page (or filename) and parse it.

    :param pdf_path: String path to the PDF file
    :param pdf_file: String name of the PDF file
    :return: The extracted data for an energy report or a summary report; None for a
             condition report
    :raises NotImplementedError: If the PDF is none of the recognised report types
    """
    # Only the first page is needed to decide which parser applies
    with open(pdf_path, "rb") as handle:
        pdf = PyPDF2.PdfReader(handle)
        opening_text = pdf.pages[0].extract_text() if pdf.pages else ""

    if is_energy_report(opening_text):
        return extract_epr(pdf_path)

    if "summary" in pdf_file.lower() or is_summary_report(opening_text):
        return extract_summary_report(pdf_path)

    if is_condition_report(opening_text):
        # Condition reports are recognised but nothing is extracted from them
        return None

    raise NotImplementedError("Implement me")
+
+
def is_condition_report(text):
    """
    Tells whether the PDF text belongs to a Condition Report.
    Returns True if the text begins with either of the known condition report headers.
    """
    known_headers = (
        "OsmosisACDNEWPAS2035ConditionReport",
        "OsmosisACDPAS2035ConditionReport",
    )
    return text.startswith(known_headers)
+
+
+def main():
+ """
+ This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+ """
+ # List only directories in the specified FILE_PATH
+ survey_folders = []
+
+ # Loop over each survey folder and list its contents
+ for i in range(1, NUM_FOLDERS + 1):
+ folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+ if os.path.isdir(folder_path): # Check if folder exists
+ folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+ survey_folders.extend(folder_contents) # Append contents to the master list
+
+ # Get rid of .DS_Store files
+ survey_folders = [folder for folder in survey_folders if not folder.endswith(".DS_Store")]
+
+ extracted_data = []
+ for survey_folder in tqdm(survey_folders):
+ survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
+
+ # List the folders inside of the survey folder
+ survey_subfolders = [name for name in os.listdir(survey_folder_path)
+ if os.path.isdir(os.path.join(survey_folder_path, name))]
+
+ # Check if there's a "retrofit assessment" folder
+ retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+
+ ra_folder = next(
+ (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+ None
+ )
+
+ # If retrofit assessment folder exists, check if it has content
+ if retrofit_folder or ra_folder:
+ if retrofit_folder:
+ retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+ else:
+ retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+ # Check if everything inside is a sub-folder and the number of folders is 2
+ items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+ all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+ if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+ # Get the folder that isn't Property Pics
+ retrofit_folder_path = os.path.join(
+ retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+ )
+
+ if os.listdir(retrofit_folder_path): # If not empty
+ summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+ continue
+ else:
+ # Then we have an empty Retrofit Assessment folder
+ continue
+
+ # If no retrofit folder or it was empty, check files in survey_folder
+
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+ if not summary_data:
+ if len(survey_subfolders) == 1:
+ survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+
+ extracted_data = pd.DataFrame(extracted_data)
+
+ extracted_data["Primary Energy Use (kWh/yr)"] = (
+ extracted_data["Primary Energy Use Intensity (kWh/m2/yr)"] * extracted_data["Total Floor Area (m2)"]
+ )
+ extracted_data["Current SAP Rating"] = extracted_data["Current SAP Rating"].astype(int)
+ extracted_data["Current EPC Band"] = extracted_data["Current SAP Rating"].apply(sap_to_epc)
+
+ # Remove some definite duplicates
+ dupes = extracted_data[extracted_data["Address"].duplicated()]["Address"]
+ dupes = extracted_data[extracted_data["Address"].isin(dupes)]
+ dupes = dupes.sort_values("Address")
+ # Get all of the folders that end with ROSS
+ to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
+
+ extracted_data = extracted_data[
+ ~extracted_data["survey_folder"].isin(
+ [
+ "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+ "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
+ "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
+ ] + to_drop
+ )
+ ]
+
+ # We now merge on the coordinator data so that against each property, we can map the measures
+ # TODO: Get the pre & post primary energy numbers
+ # TODO: Make sure the numbers are going down
+
+ retrofit_packages_board = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
+ ),
+ header=4
+ )
+ retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+ # Take just the rows that have been surveyed
+ retrofit_packages_board = retrofit_packages_board[
+ retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
+ ]
+ # populated_primary_energy = retrofit_packages_board[
+ # ~pd.isnull(retrofit_packages_board['BASE Primary energy (13a-272)'])
+ # ]
+ #
+ # z = populated_primary_energy[
+ # populated_primary_energy['POST Primary energy (13a - 272)'] > populated_primary_energy[
+ # 'BASE Primary energy (13a-272)']
+ # ]
+ #
+ # all(populated_primary_energy['POST Primary energy (13a - 272)'] <= populated_primary_energy[
+ # 'BASE Primary energy (13a-272)'])
+
+ # Replace \n with ""
+ extracted_data["Postcode"] = extracted_data["Postcode"].str.replace("\n", "")
+
+ manual_filters = {
+ "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
+ "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
+ "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
+ 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
+ '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
+ '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
+ 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
+ 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+ '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
+ '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
+ '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
+ # '2 Sorrell Place': '',
+ # '72 St Ives Road': '',
+ # '1 The Close, Burton Gardens': '',
+ # '102 Cheaton Close': '',
+ # 'Flat 16 Spring Gardens': '',
+ # '4 Apple Close': '',
+ # '25 Folly Lane': '',
+ '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
+ '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
+ '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
+ '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
+ '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
+ '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
+ "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
+ '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
+ '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
+ '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
+ '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
+ }
+
+ # We now match this retrofit packages board to the extracted data
+ matching_lookup = []
+ for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in manual_filters:
+ filtered = extracted_data[extracted_data["survey_folder"] == manual_filters[home["Name"]]].copy()
+ else:
+ filtered = extracted_data[extracted_data["Postcode"].str.lower() == home["Postcode"].lower()].copy()
+
+ # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+ if to_filter.sum() == 0:
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
+ "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ # home["Name"] should be contained in the survey_folder
+ filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+ # We have an edge case wher some properties have two outputs in Sharepoint
+ if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+ raise Exception("Fix me1")
+ # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+
+ if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+ raise Exception("Fix me2")
+ # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
+ if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+ filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
+
+ if filtered.empty:
+ continue
+ if filtered.shape[0] != 1:
+ raise Exception("something went wrong")
+
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+
+ matching_lookup = pd.DataFrame(matching_lookup)
+ # Find Osmosis IDs that are in the packages board but not in the matching lookup
+ missing_ids = set(retrofit_packages_board["Address ID"]) - set(matching_lookup["Address ID"])
+ missing_ids = list(missing_ids)
+ if missing_ids:
+ # We check that the missing ids have no data yet
+ # missed = retrofit_packages_board[retrofit_packages_board["Address ID"].isin(missing_ids)]
+ # missed[["Name", "Postcode", "Archetype ID", "Arch. Group Rank"]].to_csv(
+ # CUSTOMER_FOLDER_PATH + "/missed_debugging.csv")
+
+ if len(missing_ids) != 1:
+ raise Exception("Unacceptable number of missings")
+
+ if matching_lookup["Address ID"].duplicated().sum():
+ raise Exception("Duplicate Address IDs")
+
+ if matching_lookup["survey_folder"].duplicated().sum():
+ raise Exception("Duplicate survey folders")
+
+ measure_columns = [
+ 'Main Wall Insulation',
+ 'Secondary Wall Insulation',
+ 'Loft insulation',
+ 'Flat Roof',
+ 'Room in Roof',
+ 'Window Upgrade',
+ 'Door Upgrade',
+ 'Ventilation',
+ 'Main Heating',
+ 'Water Heating',
+ 'Heating Controls',
+ 'Solar PV',
+ 'Other measures'
+ ]
+
+ # We should end up with a 1:1 mapping between the Osm. ID and the survey folder
+ stonewater_data = extracted_data.merge(matching_lookup, on="survey_folder", how="inner").merge(
+ retrofit_packages_board[
+ [
+ "Name",
+ "RA",
+ "Address ID",
+ "Archetype ID",
+ "Arch. Group Rank",
+ "Actual SAP Band",
+ "Actual SAP Rating",
+ "Modelled SAP Band",
+ "Modelled SAP Rating",
+ "Package Ref",
+ ] + measure_columns
+ ],
+ on=["Address ID", "Name"],
+ how="left"
+ )
+
+ if stonewater_data["Address ID"].duplicated().sum():
+ raise Exception("Duplicate Address IDs")
+ # Create a section for costs
+ for measure in measure_columns:
+ stonewater_data[f"Cost of {measure}"] = None
+
+ stonewater_data["Total Cost of Measures"] = None
+ stonewater_data["Contingency Cost"] = None
+ stonewater_data["Total Cost of Measures inc Contingency"] = None
+
+ # We've appended the recommended packages and modelled SAP ratings to the data
+ # We also want to append the windows data
+ windows_data = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Window data included AP Copy Stonewater SHDF_3_0_Board Triage Master Filtered 26.07.24.xlsx"
+ ),
+ header=12
+ )
+
+ windows_data = windows_data[windows_data["Address ID"] != "Address ID"]
+ windows_data = windows_data[~pd.isnull(windows_data["Address ID"])]
+
+ # We get a lookup id of Osm.ID and when the windows were fitted
+ windows_data = windows_data[
+ ["Address ID", "Window attributes - Fitted/renewed date",
+ "Parent Asset Window attributes - Fitted/renewed date"]
+ ]
+ # Convert to string for the moment
+ windows_data["Parent Asset Window attributes - Fitted/renewed date"] = windows_data[
+ "Parent Asset Window attributes - Fitted/renewed date"
+ ].astype(str)
+ # Create a single date column
+ windows_data["Fitted/renewed date"] = np.where(
+ pd.notnull(windows_data["Window attributes - Fitted/renewed date"]),
+ windows_data["Window attributes - Fitted/renewed date"],
+ windows_data["Parent Asset Window attributes - Fitted/renewed date"]
+ )
+ # Convert to a date
+ windows_data["Fitted/renewed date"] = pd.to_datetime(windows_data["Fitted/renewed date"])
+ # Calculate the number of years since something was done on the windows
+ windows_data["Years since fitted/renewed"] = (pd.Timestamp.now() - windows_data[
+ "Fitted/renewed date"]).dt.days / 365
+
+ stonewater_data["Package Includes Windows"] = ~pd.isnull(stonewater_data["Window Upgrade"])
+ windows_data["Address ID"] = windows_data["Address ID"].astype(float)
+ stonewater_data = stonewater_data.merge(windows_data, on="Address ID", how="left")
+ stonewater_data = stonewater_data.sort_values("Archetype ID", ascending=True)
+
+ if stonewater_data["Address ID"].duplicated().sum():
+ raise Exception("Duplicate Address IDs")
+
+ for c in [
+ 'Window attributes - Fitted/renewed date',
+ 'Parent Asset Window attributes - Fitted/renewed date',
+ 'Fitted/renewed date'
+ ]:
+ stonewater_data[c] = stonewater_data[c].astype(str)
+
+ # Fill the primary energy numbers from the Excel sheet
+ stonewater_data = stonewater_data.merge(
+ retrofit_packages_board[
+ [
+ "Name", "Address ID", "BASE Primary energy (13a-272)", "POST Primary energy (13a - 272)"
+ ]
+ ],
+ on=["Address ID", "Name"],
+ how="left"
+ )
+ stonewater_data["Primary Energy Use (kWh/yr)"] = np.where(
+ pd.isnull(stonewater_data["Primary Energy Use (kWh/yr)"]),
+ stonewater_data["BASE Primary energy (13a-272)"],
+ stonewater_data["Primary Energy Use (kWh/yr)"]
+ )
+ stonewater_data = stonewater_data.drop(columns=["BASE Primary energy (13a-272)"])
+
+ # Add on organisation reference
+ original_archetypes = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+ "- Archetyped V3.1.xlsx",
+ header=4
+ )
+ original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
+ original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
+ original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+
+ stonewater_data = stonewater_data.merge(
+ original_archetypes[["Address ID", 'Org. ref.']],
+ on="Address ID",
+ how="left"
+ )
+
+ # Save this data to excel
+ stonewater_data.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - costed retrofit packages V4.xlsx", index=False)
+
+ cost_sheet = [
+ {
+ "measure": "EWI 0.30 w.m2.K", "cost": 298.35, "unit": "m2"
+ },
+ {
+ "measure": "CWI RdSAP Default", "cost": 14.21, "unit": "m2"
+ },
+ {
+ "measure": "Poss Extract CWI & Refill (issues identified)", "cost": 14.21 + 25, "unit": "m2"
+ },
+ {
+ "measure": "IWI 0.30 w.m2.K", "cost": 244.80, "unit": "m2"
+ },
+ {
+ "measure": "EWI/IWI 0.3", "cost": (298.35 + 244.8) / 2, "unit": "m2"
+ },
+ {
+ "measure": "Loft Insulation 0.11 w.m2.K", "cost": 16.07, "unit": "m2"
+ },
+ {
+ "measure": "Flat Roof 0.11 w.m2.K", "cost": 195, "unit": "m2"
+ },
+ {
+ "measure": "DG Window 1.30 w.m2.K", "cost": 1140, "unit": "each"
+ },
+ {
+ "measure": "Secondary 2.40", "cost": 974, "unit": "each"
+ },
+ {
+ "measure": "Ins. Door 1.30 w.m2.K", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "Ins. Door 1.40 w.m2.K", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "DMEV", "cost": 900, "unit": "each"
+ },
+ {
+ "measure": "ASHP Vaillant 102607 5kw", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "HHRSH Quantum 150", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "Dual Stat Tank 210lt 50mm Foam", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "Dual Stat Tank 160lt 50mm Foam", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "Dual Stat Tank 110lt 50mm Foam", "cost": None, "unit": "each"
+ },
+ {
+ "measure": "Smart Thermostat", "cost": 1200, "unit": "each"
+ },
+ {
+ "measure": "TRV's", "cost": 350, "unit": "each"
+ },
+ {
+ "measure": "Solar PV - 3.0kwp", "cost": 4365.0, "unit": "each"
+ },
+ {
+ "measure": "Solar PV - 1.5kwp", "cost": 3881, "unit": "each"
+ },
+ {
+ "measure": "LEL", "cost": 35, "unit": "per bulb"
+ },
+ {
+ "measure": "Roof 0.16 - Walls 0.30", "cost": 180, "unit": "floor area m2"
+ },
+ {
+ "measure": "Roof 0.16 - Walls 0.16", "cost": 180, "unit": "floor area m2"
+ },
+ ]
+ cost_sheet = pd.DataFrame(cost_sheet)
+
+ # Save cost sheet - ideally this will be used as a secondary sheet for Stonewater
+ cost_sheet.to_excel(CUSTOMER_FOLDER_PATH + "/Stonewater - cost sheet.xlsx", index=False)
+
+ # stonewater_data[~pd.isnull(stonewater_data["Room in Roof"])]["survey_folder"].values
+
+ create_proposed_wave_3_bid(
+ costed_packages_filepath=os.path.join(
+ CUSTOMER_FOLDER_PATH, "Stonewater - Costed Retrofit Packages 20241030 (WIP) Single Model V3.xlsx"
+ ),
+ archetypes_sheet_filepath=os.path.join(
+ CUSTOMER_FOLDER_PATH, "Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
+ )
+ )
+
+
+ def create_proposed_wave_3_bid(costed_packages_filepath, archetypes_sheet_filepath):
+     """
+     Build the proposed Wave 3 bid sample and a per-postcode summary of it.
+
+     Costed (surveyed) properties with a current EPC band of D, E, F or G define which archetypes go into the bid.
+     Every property in the archetype sheet that belongs to one of those archetypes is proposed, and is labelled
+     with the package reference(s), roof(s) and heating system(s) found on the surveyed properties of its
+     archetype. The match is "Exact" when a surveyed property of the archetype shares the home's Property Type,
+     Wall Type, Roof Type and Heating, and "Approximate" when only the archetype matches.
+
+     Two workbooks are written into CUSTOMER_FOLDER_PATH: the proposed sample, and a count per postcode of the
+     properties inside and outside the proposal.
+
+     :param costed_packages_filepath: Path to the costed packages workbook; the "Modelled Packages" sheet is read
+         with header=13.
+     :param archetypes_sheet_filepath: Path to the archetype workbook, read with header=4.
+     :raises Exception: "Fix me" tripwire kept from the original code, raised when an approximate match resolves
+         to exactly the package "2B2A".
+     :return: None
+     """
+
+     def _join_labels(values):
+         # Distinct, non-blank values as one sorted, newline-free "a or b" string. Nulls are skipped so missing
+         # survey data neither crashes on .strip() nor leaks into the output as the literal text "nan".
+         labels = sorted({str(value) for value in values.dropna() if str(value).strip()})
+         return " or ".join(labels).replace("\n", "")
+
+     # We read in the costed packages
+     costed_packages = pd.read_excel(costed_packages_filepath, header=13, sheet_name="Modelled Packages")
+     costed_packages = costed_packages[~pd.isnull(costed_packages["Address"])]
+
+     roof_columns = ['Main Roof Type', 'Main Roof Insulation', 'Main Roof Insulation Thickness']
+     heating_columns = ['Existing Primary Heating System', 'Existing Primary Heating PCDF Reference']
+     archetypes_to_cost = costed_packages[
+         [
+             "Name", "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Modelled SAP Band",
+             "Modelled SAP Rating", "Package Ref", 'Total Cost of Measures', 'Contingency Cost',
+             'Total Cost of Measures inc Contingency'
+         ] + roof_columns + heating_columns
+     ].copy()
+
+     # Combine the roof type, insulation and insulation thickness, separating by colons
+     archetypes_to_cost['Surveyed Main Roof'] = (
+         archetypes_to_cost['Main Roof Type'] + ': ' + archetypes_to_cost['Main Roof Insulation'] + ': ' +
+         archetypes_to_cost['Main Roof Insulation Thickness'].astype(str)
+     )
+
+     # Combine the heating system and its PCDF reference, separating by colons
+     archetypes_to_cost['Surveyed Main Heating'] = (
+         archetypes_to_cost['Existing Primary Heating System'] + ': code - ' +
+         archetypes_to_cost['Existing Primary Heating PCDF Reference'].astype(str)
+     )
+
+     archetypes_to_cost = archetypes_to_cost.drop(columns=roof_columns + heating_columns)
+
+     # We take properties that are EPC D and below (59% of units, per the original author's note).
+     # .copy() so the column assignment below works on an independent frame rather than a slice.
+     archetypes_to_cost = archetypes_to_cost[
+         archetypes_to_cost["Current EPC Band"].isin(["D", "E", "F", "G"])
+     ].copy()
+     archetypes_to_cost["Has been modelled"] = ~pd.isnull(archetypes_to_cost["Modelled SAP Band"])
+
+     # These are the archetypes that will likely be suitable for Wave 3.
+     # Rows with no Address ID, or whose Address ID cell holds the literal text "Address ID", are dropped
+     # before the column is cast to int.
+     archetypes_sheet = pd.read_excel(archetypes_sheet_filepath, header=4)
+     archetypes_sheet = archetypes_sheet[~pd.isnull(archetypes_sheet["Address ID"])]
+     archetypes_sheet = archetypes_sheet[archetypes_sheet["Address ID"] != "Address ID"].copy()
+     archetypes_sheet["Address ID"] = archetypes_sheet["Address ID"].astype(int)
+
+     # We merge the property details onto the costed archetypes
+     archetypes_to_cost = archetypes_to_cost.merge(
+         archetypes_sheet[["Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+         on="Address ID",
+         how="left"
+     )
+
+     # Archetype IDs are compared as strings; the membership mask is built once and used for both splits
+     bid_archetype_ids = set(archetypes_to_cost["Archetype ID"].astype(int).astype(str))
+     in_bid = archetypes_sheet["Archetype ID"].astype(str).isin(bid_archetype_ids)
+
+     not_proposed = archetypes_sheet[~in_bid]
+     proposed_sample = archetypes_sheet.loc[
+         in_bid,
+         [
+             "Name", "Postcode", "UPRN", "UDPRN", "Address ID", "Osm. ID", "Archetype ID",
+             "Property Type", "Wall Type", "Roof Type", "Heating"
+         ]
+     ]
+
+     # Open question from the original author: can we propose anything about archetypes that were not surveyed?
+
+     # We classify into high ("Exact") and low ("Approximate") confidence
+     match_classification = []
+     for _, home in tqdm(proposed_sample.iterrows(), total=len(proposed_sample)):
+         archetype_rows = archetypes_to_cost[archetypes_to_cost["Archetype ID"] == home["Archetype ID"]]
+
+         # A perfect match is a surveyed property with the same four asset-list attributes
+         exact_rows = archetype_rows[
+             (archetype_rows["Property Type"] == home["Property Type"]) &
+             (archetype_rows["Wall Type"] == home["Wall Type"]) &
+             (archetype_rows["Roof Type"] == home["Roof Type"]) &
+             (archetype_rows["Heating"] == home["Heating"])
+         ]
+
+         # With no perfect match we fall back to everything surveyed for the archetype
+         if exact_rows.empty:
+             match_label, evidence = "Approximate", archetype_rows
+         else:
+             match_label, evidence = "Exact", exact_rows
+
+         package = _join_labels(evidence["Package Ref"])
+         if match_label == "Approximate" and package == "2B2A":
+             # Tripwire left by the original author; this case has not been handled yet
+             raise Exception("Fix me")
+
+         match_classification.append(
+             {
+                 "Address ID": home["Address ID"],
+                 "Match to Surveyed": match_label,
+                 "Proposed Package Ref": package,
+                 "Surveyed Archetype Roofs": _join_labels(evidence["Surveyed Main Roof"]),
+                 "Surveyed Archetype Heating": _join_labels(evidence["Surveyed Main Heating"])
+             }
+         )
+
+     # Explicit columns keep the merge below working even when nothing was proposed
+     match_classification = pd.DataFrame(
+         match_classification,
+         columns=[
+             "Address ID", "Match to Surveyed", "Proposed Package Ref",
+             "Surveyed Archetype Roofs", "Surveyed Archetype Heating"
+         ]
+     )
+
+     proposed_sample = proposed_sample.merge(
+         match_classification,
+         on="Address ID",
+         how="left",
+     )
+
+     # Merge on the mean cost per archetype
+     cost_per_archetype = (
+         archetypes_to_cost.groupby("Archetype ID")[['Total Cost of Measures inc Contingency']].mean().reset_index()
+     )
+     proposed_sample = proposed_sample.merge(
+         cost_per_archetype,
+         on="Archetype ID",
+         how="left"
+     )
+
+     # We add on a boolean to indicate if a property from that archetype has been modelled
+     proposed_sample = proposed_sample.merge(
+         archetypes_to_cost.groupby("Archetype ID")[["Has been modelled"]].any().reset_index(),
+         on="Archetype ID",
+         how="left"
+     )
+
+     # Blank the cost wherever no property of the archetype has been modelled
+     proposed_sample["Total Cost of Measures inc Contingency"] = np.where(
+         ~proposed_sample["Has been modelled"],
+         None, proposed_sample["Total Cost of Measures inc Contingency"]
+     )
+
+     proposed_sample = proposed_sample.sort_values("Archetype ID", ascending=True)
+
+     # Save excel
+     proposed_sample.to_excel(
+         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Proposed Wave 3 Bid V2 (WIP).xlsx"), index=False
+     )
+
+     # For each postcode that's in the bid, we also summarise the number of units in the bid and number left out
+     postcode_summary = pd.DataFrame(
+         [
+             {
+                 "Postcode": postcode,
+                 "Number of properties in Proposal": int((proposed_sample["Postcode"] == postcode).sum()),
+                 "Number of properties not in Proposal": int((not_proposed["Postcode"] == postcode).sum())
+             }
+             for postcode in proposed_sample["Postcode"].unique()
+         ],
+         columns=["Postcode", "Number of properties in Proposal", "Number of properties not in Proposal"]
+     )
+     postcode_summary = postcode_summary.sort_values(
+         "Number of properties not in Proposal",
+         ascending=False).reset_index(drop=True)
+
+     postcode_summary.to_excel(
+         os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Proposed Wave 3 Bid Postcode Summary.xlsx"), index=False
+     )
+
+
+ def find_remaining_surveys(surveyed_filepath=None, costed_filepath=None, output_filepath=None):
+     """
+     List the surveyed properties that do not yet have a costed retrofit package.
+
+     This compares the properties that have been surveyed against the properties that costed retrofit packages
+     have been produced for, so we know which surveys still need to be downloaded from SharePoint. The remaining
+     properties are written to CSV with an "id" column built as "<Archetype ID>-<Arch. Group Rank>".
+
+     :param surveyed_filepath: Workbook of surveyed properties, read with header=4. Defaults to the path that
+         was previously hard-coded here.
+     :param costed_filepath: Workbook of costed packages; the "Modelled Packages" sheet is read with header=13.
+         Defaults to the path that was previously hard-coded here.
+     :param output_filepath: Destination CSV. Defaults to the path that was previously hard-coded here.
+     :raises AssertionError: If the needed and costed row counts do not add up to the surveyed row count.
+     :return: None
+     """
+     # Defaults reproduce the paths that were previously hard-coded in this function
+     default_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
+     if surveyed_filepath is None:
+         surveyed_filepath = default_folder + "/Stonewater_SHDF_3_0_Board_work_in_progress_- 07.11.24.xlsx"
+     if costed_filepath is None:
+         costed_filepath = (
+             default_folder + "/Stonewater - Costed Retrofit Packages 20241030 (WIP) MR Review v1.xlsx"
+         )
+     if output_filepath is None:
+         output_filepath = default_folder + "/needed_surveys.csv"
+
+     surveyed = pd.read_excel(surveyed_filepath, header=4)
+
+     costed = pd.read_excel(costed_filepath, header=13, sheet_name="Modelled Packages")
+     costed = costed[~pd.isnull(costed["Address ID"])]
+
+     # .copy() so the "id" column is added to an independent frame rather than to a slice of `surveyed`
+     needed = surveyed[~surveyed["Address ID"].isin(costed["Address ID"])].copy()
+
+     needed["id"] = needed["Archetype ID"].astype(str) + "-" + needed["Arch. Group Rank"].astype(str)
+     needed = needed.sort_values("id", ascending=True)
+     needed[["id", "Name", "Postcode"]].to_csv(output_filepath)
+
+     # Sanity check: every surveyed row should be either costed or still needed. Raised explicitly (rather than
+     # via `assert`) so the check still runs when Python is started with -O.
+     if needed.shape[0] + costed.shape[0] != surveyed.shape[0]:
+         raise AssertionError(
+             "Needed and costed row counts do not add up to the number of surveyed rows"
+         )
+
+
+ def append_stonewater_id(model_filepath=None, archetypes_filepath=None, output_filepath=None):
+     """
+     Append Stonewater's organisation reference ("Org. ref.") onto the modelled packages.
+
+     This completes an adhoc request from Stonewater to add their organisation reference onto the model. The
+     modelled packages are left-joined to the archetype workbook on "Address ID" and the result is written to a
+     "Proposed Wave 3 Sample" sheet in an Excel workbook.
+
+     :param model_filepath: Workbook holding the modelled packages; the "Modelled Packages" sheet is read with
+         header=13. Defaults to the path that was previously hard-coded here.
+     :param archetypes_filepath: Archetype workbook carrying "Org. ref.", read with header=4. Defaults to the
+         path that was previously hard-coded here.
+     :param output_filepath: Destination workbook. Defaults to the path that was previously hard-coded here.
+     :raises ValueError: If any modelled property is left without an "Org. ref." after the join.
+     :return: None
+     """
+     # Defaults reproduce the paths that were previously hard-coded in this function
+     default_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater"
+     if model_filepath is None:
+         model_filepath = default_folder + "/Stonewater - Bid Packages WIP 13.11.24.xlsx"
+     if archetypes_filepath is None:
+         archetypes_filepath = (
+             default_folder + "/Stonewater SHDF_3_0_Board Triage 22.05.24 - Archetyped V3.1.xlsx"
+         )
+     if output_filepath is None:
+         output_filepath = default_folder + "/Stonewater IDs.xlsx"
+
+     model_proposed_sample = pd.read_excel(model_filepath, sheet_name="Modelled Packages", header=13)
+     # .copy() so the cast below is applied to an independent frame rather than to a slice
+     model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])].copy()
+     model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int)
+
+     # Rows with no Address ID, or whose Address ID cell holds the literal text "Address ID", are dropped
+     # before the column is cast to int.
+     original_archetypes = pd.read_excel(archetypes_filepath, header=4)
+     original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
+     original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"].copy()
+     original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+
+     matched = model_proposed_sample.merge(
+         original_archetypes[["Address ID", 'Org. ref.']],
+         on="Address ID",
+         how="left"
+     )
+
+     # Every modelled property is expected to have an organisation reference
+     if pd.isnull(matched["Org. ref."]).sum():
+         raise ValueError("Something went wrong")
+
+     # Save as an Excel workbook
+     matched.to_excel(
+         output_filepath,
+         sheet_name="Proposed Wave 3 Sample",
+         index=False
+     )
+
+
+def propsed_wave_3_sample():
+ """
+ Stonewater want to ensure that, when selecting properties for wave 3, they choose properties
+ such that most of the properties within a geographical area are treatable within the bid.
+ Namely, if we take a geographical area (which could be a postal region), they want most, and ideally all, of the
+ properties within that geographical area to be included within the bid.
+ :return:
+ """
+
+ asset_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+ "- Archetyped V3.1.xlsx",
+ header=4
+ )
+
+ # TODO: We drop 7 properties missing
+ # UPRN
+ asset_list = asset_list[~asset_list["Archetype ID"].isin(["MISSING UPRN"])]
+ # Clean address ids
+ asset_list = asset_list[~pd.isnull(asset_list["Address ID"])]
+ asset_list = asset_list[asset_list["Address ID"] != "Address ID"]
+ asset_list["Address ID"] = asset_list["Address ID"].astype(int)
+
+ asset_list["Street name"] = np.where(
+ pd.isnull(asset_list["Street name"]),
+ asset_list["Postcode"],
+ asset_list["Street name"]
+ )
+
+ # Create the postal region, taking the first part of the postcode
+ asset_list["Postal Region"] = asset_list["Postcode"].str.split(" ").str[0]
+ asset_list["Street and Region"] = asset_list["Street name"] + " " + asset_list["Postal Region"]
+ unique_postal_regions = asset_list["Postal Region"].unique()
+
+ # Keep just the columns we need
+ asset_list = asset_list[
+ ["UPRN", "Address ID", 'Org. ref.', "Archetype ID", "Postal Region", "Name", "Postcode", "Street and Region",
+ "Property Type", "Wall Type", "Roof Type", "Heating"]
+ ]
+
+ survey_results = pd.read_excel(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
+ header=13,
+ sheet_name="Modelled Packages"
+ )
+
+ survey_results = survey_results[
+ [
+ "Address ID", "Archetype ID", "Current SAP Rating", "Current EPC Band", "Postcode",
+ "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
+ "Existing Primary Heating System",
+ "Package Ref",
+ "Main Wall Type", "Main Wall Insulation Type", "Main Wall Thickness",
+ "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation",
+ "Main Building Alternative Wall Thickness"
+ ]
+ ].rename(
+ columns={
+ "Existing Primary Heating System": "Survey: Primary Heating System"
+ }
+ )
+
+ survey_results["Postal Region"] = survey_results["Postcode"].str.split(" ").str[0]
+ # Concatenate from the wall information
+ survey_results["Survey: Main Wall Type"] = survey_results["Main Wall Type"].astype(str) + ": " + survey_results[
+ "Main Wall Insulation Type"].astype(str)
+ # Alternative wall
+ survey_results["Survey: Main Alternative Wall"] = (
+ survey_results["Main Building Alternative Wall Type"].astype(str) + ": " + survey_results[
+ "Main Building Alternative Wall Insulation"].astype(str)
+ )
+ # Roof information
+ survey_results["Survey: Main Roof Type"] = survey_results["Main Roof Type"].astype(str) + ": " + survey_results[
+ "Main Roof Insulation"].astype(str) + ": " + survey_results["Main Roof Insulation Thickness"].astype(str)
+
+ # Drop the individual columns:
+ survey_results = survey_results.drop(
+ columns=[
+ "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness",
+ "Main Wall Type", "Main Wall Insulation Type",
+ "Main Building Alternative Wall Type", "Main Building Alternative Wall Insulation"
+ ]
+ )
+
+ survey_results_with_original_features = survey_results.merge(
+ asset_list[["UPRN", "Address ID", "Property Type", "Wall Type", "Roof Type", "Heating"]],
+ on="Address ID",
+ how="left"
+ )
+
+ if survey_results_with_original_features.shape[0] != survey_results.shape[0]:
+ raise ValueError("Something went wrong")
+
+ # Against properties that have NO package ref, we assign a package ref
+ properties_with_packages = survey_results_with_original_features[
+ ~pd.isnull(survey_results_with_original_features["Package Ref"])
+ ]
+
+ properties_without_packages = survey_results_with_original_features[
+ (survey_results_with_original_features["Current SAP Rating"] < 69) & pd.isnull(
+ survey_results_with_original_features["Package Ref"]
+ )
+ ]
+
+ # Change this to a lookup
+ package_ratings = pd.DataFrame([
+ {
+ "1A": 1,
+ "1B": 2,
+ "2A": 3,
+ "2B": 4,
+ "3A": 5,
+ "3B": 6,
+ 4: 7
+ }
+ ])
+ package_ratings = pd.melt(package_ratings, var_name="Package Ref", value_name="Rank")
+
+ mapped_package_refs = []
+ for _, property in tqdm(properties_without_packages.iterrows(), total=len(properties_without_packages)):
+ # Same archetype?
+ matches = properties_with_packages[properties_with_packages["Archetype ID"] == property["Archetype ID"]]
+
+ if matches.empty:
+ # Similar property
+ matches = properties_with_packages[
+ (properties_with_packages["Property Type"].str.split(":").str[0] ==
+ property["Property Type"].split(":")[0]) &
+ (properties_with_packages["Wall Type"] == property["Wall Type"]) &
+ (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+ (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+ ]
+ if matches.empty:
+ matches = properties_with_packages[
+ (properties_with_packages["Property Type"].str.split(":").str[0] ==
+ property["Property Type"].split(":")[0]) &
+ (properties_with_packages["Wall Type"].str.split(":").str[0] == property["Wall Type"].split(":")[0]) &
+ (properties_with_packages["Roof Type"].str.split(":").str[0] == property["Roof Type"].split(":")[0]) &
+ (properties_with_packages["Heating"].str.split(":").str[0] == property["Heating"].split(":")[0])
+ ]
+ if matches.empty:
+ raise Exception("Implement me")
+ if matches.shape[0] > 1:
+ # Take the package with the highest rank
+ matches = matches.merge(
+ package_ratings,
+ on="Package Ref",
+ how="left"
+ ).sort_values("Rank", ascending=False).head(1)
+
+ mapped_package_refs.append(
+ {
+ "Address ID": property["Address ID"],
+ "Matched Package Ref": matches["Package Ref"].values[0]
+ }
+ )
+
+ mapped_package_refs = pd.DataFrame(mapped_package_refs)
+
+ survey_results = survey_results.merge(
+ mapped_package_refs,
+ on="Address ID",
+ how="left"
+ )
+ survey_results["Package Ref"] = np.where(
+ pd.notnull(survey_results["Matched Package Ref"]),
+ survey_results["Matched Package Ref"],
+ survey_results["Package Ref"]
+ )
+ survey_results = survey_results.drop(columns=["Matched Package Ref"])
+
+ # Do the same with survey_results_with_original_features
+ survey_results_with_original_features = survey_results_with_original_features.merge(
+ mapped_package_refs,
+ on="Address ID",
+ how="left"
+ )
+ survey_results_with_original_features["Package Ref"] = np.where(
+ pd.notnull(survey_results_with_original_features["Matched Package Ref"]),
+ survey_results_with_original_features["Matched Package Ref"],
+ survey_results_with_original_features["Package Ref"]
+ )
+ survey_results_with_original_features = survey_results_with_original_features.drop(columns=["Matched Package Ref"])
+
+ # Save the data for reference
+ # mapped_package_refs = mapped_package_refs.merge(
+ # asset_list[["Name", "Postcode", "Address ID", "Org. ref."]],
+ # on="Address ID",
+ # how="left"
+ # )
+ # mapped_package_refs.to_csv(os.path.join(CUSTOMER_FOLDER_PATH, "mapped_package_refs.csv"), index=False)
+
+ # We get longitude & Latitude
+ archetyping_spatial_features = read_pickle_from_s3(
+ bucket_name="retrofit-data-dev", s3_file_name="scustomers/Stonewater/clustering/spatial_data_to_uprn.pkl",
+ )
+ archetyping_spatial_features = pd.concat(archetyping_spatial_features)
+ archetyping_spatial_features = archetyping_spatial_features[["UPRN", 'LATITUDE', 'LONGITUDE']].rename(
+ columns={"LATITUDE": "latitude", "LONGITUDE": "longitude"}
+ )
+ # Merge them onto both datasets
+ asset_list = asset_list.merge(
+ archetyping_spatial_features, how="left", on="UPRN"
+ )
+ if pd.isnull(asset_list["longitude"]).sum():
+ raise ValueError("Something went wrong")
+
+ survey_results_with_original_features = survey_results_with_original_features.merge(
+ archetyping_spatial_features, how="left", on="UPRN"
+ )
+ if pd.isnull(survey_results_with_original_features["longitude"]).sum():
+ raise ValueError("Something went wrong")
+
+ def haversine(lat1, lon1, lat2, lon2):
+ # Radius of Earth in meters
+ R = 6371000
+
+ # Convert degrees to radians
+ lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
+
+ # Differences
+ dlat = lat2 - lat1
+ dlon = lon2 - lon1
+
+ # Haversine formula
+ a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
+ c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
+ distance = R * c
+ return distance
+
+ # Tier definitions
+ # Tier 1: We have a property in the same postal region and same archetype that was surveyed and is below EPC D
+ # Tier 2: We have a property in the same archetype that was surveyed and is below EPC D
+ #
+
+ def match_property_to_surveyed(property, survey_results_with_original_features):
+ surveyed = survey_results_with_original_features[
+ (
+ survey_results_with_original_features["Postal Region"] ==
+ property["Postal Region"]
+ ) &
+ (
+ survey_results_with_original_features["Property Type"] ==
+ property["Property Type"]
+ )
+ &
+ (
+ survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+ property["Wall Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ property["Roof Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+ property["Heating"].split(":")[0]
+ )
+ ].copy()
+
+ if not surveyed.empty:
+ return surveyed
+
+ surveyed = survey_results_with_original_features[
+ (
+ survey_results_with_original_features["Postal Region"] ==
+ property["Postal Region"]
+ ) &
+ (
+ survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+ property["Property Type"].split(":")[0]
+ )
+ &
+ (
+ survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+ property["Wall Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ property["Roof Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+ property["Heating"].split(":")[0]
+ )
+ ].copy()
+
+ # surveyed = survey_results_with_original_features[
+ # (
+ # survey_results_with_original_features["Property Type"] ==
+ # property["Property Type"]
+ # ) &
+ # (
+ # survey_results_with_original_features["Wall Type"] ==
+ # property["Wall Type"]
+ # ) &
+ # (
+ # survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ # property["Roof Type"].split(":")[0]
+ # ) &
+ # (
+ # survey_results_with_original_features["Heating"] ==
+ # property["Heating"]
+ # )
+ # ].copy()
+
+ if not surveyed.empty:
+ return surveyed
+
+ surveyed = survey_results_with_original_features[
+ (
+ survey_results_with_original_features["Property Type"] ==
+ property["Property Type"]
+ ) &
+ (
+ survey_results_with_original_features["Wall Type"] ==
+ property["Wall Type"]
+ ) &
+ (
+ survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ property["Roof Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Heating"].str.split(":").str[0] ==
+ property["Heating"].split(":")[0]
+ )
+ ].copy()
+
+ return surveyed
+
+ def fill_survey_columns(region_assets, suffix):
+ for col in [
+ 'Current EPC Band', 'Current SAP Rating',
+ 'Survey: Main Wall Type', 'Survey: Main Alternative Wall',
+ 'Survey: Main Roof Type', 'Survey: Primary Heating System',
+ 'Survey: Matching Address ID', 'Distance to Closest Match (m)',
+ "Package Ref"
+ ]:
+ region_assets[col] = np.where(
+ pd.isnull(region_assets[col]) & pd.notnull(region_assets[col + suffix]),
+ region_assets[col + suffix], region_assets[col]
+ )
+ return region_assets
+
+ survey_attribute_columns = [
+ "Survey: Main Wall Type", 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
+ 'Survey: Primary Heating System'
+ ]
+
+ survey_results["Survey: Matching Address ID"] = survey_results["Address ID"].copy()
+
+ results = []
+ for region in tqdm(unique_postal_regions):
+ # Take all of the properties in that region
+ region_assets = asset_list[asset_list["Postal Region"] == region].copy()
+
+ # We have a tier 1 match if the property itself was surveyed
+ exact_surveyed = survey_results[
+ survey_results["Address ID"].isin(region_assets["Address ID"])
+ ]
+
+ region_assets = region_assets.merge(
+ exact_surveyed[
+ ["Address ID", "Current EPC Band", "Current SAP Rating"] + survey_attribute_columns + [
+ "Survey: Matching Address ID", "Package Ref"
+ ]
+ ],
+ on="Address ID",
+ how="left"
+ )
+ region_assets['Distance to Closest Match (m)'] = None
+ region_assets["Distance to Closest Match (m)"] = np.where(
+ ~pd.isnull(region_assets["Current EPC Band"]),
+ 0,
+ region_assets["Distance to Closest Match (m)"]
+ )
+
+ # Label the tier 1 properties
+ region_assets["Confidence Tier"] = None
+ region_assets["Confidence Tier"] = np.where(
+ region_assets["Current EPC Band"].isin(["D", "E", "F", "G"]),
+ "1 - property was surveyed", region_assets["Confidence Tier"]
+ )
+
+ region_assets["Confidence Tier"] = np.where(
+ region_assets["Current EPC Band"].isin(["C", "B", "A"]),
+ "5 - property was surveyed", region_assets["Confidence Tier"]
+ )
+
+ archetype_ids = region_assets[
+ pd.isnull(region_assets["Confidence Tier"])
+ ]["Archetype ID"].unique()
+ # We get the properties that have been surveyed
+
+ region_surveyed = []
+ for arch_id in archetype_ids:
+ for _, property in region_assets[region_assets["Archetype ID"] == arch_id].iterrows():
+ archetype_data = survey_results_with_original_features[
+ survey_results["Archetype ID"] == arch_id
+ ].copy()
+ if archetype_data.empty:
+ continue
+
+ match_type = "2 - same archetype"
+ if any(archetype_data["Postal Region"] == property["Postal Region"]):
+ match_type = "1 - same archetype, same postal region"
+ archetype_data = archetype_data[
+ archetype_data["Postal Region"] == property["Postal Region"]
+ ]
+
+ if archetype_data.shape[0] > 1:
+ # Look for an exact match, or as close as possible
+ archetype_data_filtered = match_property_to_surveyed(property, archetype_data)
+ if not archetype_data_filtered.empty:
+ archetype_data = archetype_data_filtered
+
+ archetype_data["distance_meters"] = haversine(
+ lat1=property.latitude, lon1=property.longitude,
+ lat2=archetype_data["latitude"].values, lon2=archetype_data["longitude"].values
+ )
+ expected_sap = np.average(
+ archetype_data["Current SAP Rating"], weights=1 / (archetype_data["distance_meters"] + 1)
+ )
+ expected_epc = sap_to_epc(expected_sap)
+
+ archetype_data = archetype_data.sort_values("distance_meters", ascending=True)
+
+ # We take the features of the closest matching property
+ closest_match = archetype_data.iloc[0]
+
+ # Set the package ref
+ if expected_epc in ["C", "B", "A"]:
+ package_ref = None
+ else:
+ package_ref = archetype_data["Package Ref"].dropna().values[0]
+
+ region_surveyed.append(
+ {
+ "Archetype ID": arch_id,
+ "Address ID": property["Address ID"],
+ "Current EPC Band": expected_epc,
+ "Current SAP Rating": expected_sap,
+ 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+ 'Survey: Main Alternative Wall': closest_match["Survey: Main Alternative Wall"],
+ 'Survey: Main Roof Type': closest_match["Survey: Main Roof Type"],
+ 'Survey: Primary Heating System': closest_match["Survey: Primary Heating System"],
+ "Survey: Matching Address ID": closest_match["Address ID"],
+ 'Distance to Closest Match (m)': closest_match["distance_meters"],
+ "Package Ref": package_ref,
+ "Match Type": match_type
+ }
+ )
+ region_surveyed = pd.DataFrame(region_surveyed)
+
+ if region_surveyed.empty:
+ region_surveyed = pd.DataFrame(
+ columns=[
+ "Archetype ID", "Address ID", "Current EPC Band", "Current SAP Rating",
+ 'Survey: Main Wall Type', 'Survey: Main Alternative Wall', 'Survey: Main Roof Type',
+ 'Survey: Primary Heating System', "Survey: Matching Address ID", 'Distance to Closest Match (m)',
+ "Match Type", "Package Ref"
+ ]
+ )
+
+ starting_shape = region_assets.shape[0]
+ region_assets = region_assets.merge(
+ region_surveyed,
+ on=["Archetype ID", "Address ID"],
+ how="left",
+ suffixes=("", "_method1")
+ )
+ if region_assets.shape[0] != starting_shape:
+ raise ValueError("Something went wrong")
+
+ # Label the archetype-matched properties (EPC D or below) with their match type
+ region_assets["Confidence Tier"] = np.where(
+ region_assets["Current EPC Band_method1"].isin(["D", "E", "F", "G"]) &
+ pd.isnull(region_assets["Confidence Tier"]) & ~pd.isnull(region_assets["Match Type"]),
+ region_assets["Match Type"], region_assets["Confidence Tier"]
+ )
+
+ # Handle EPC C or above (NOTE(review): the band list below has "F"/"G" where "A" looks intended - confirm)
+ region_assets["Confidence Tier"] = np.where(
+ region_assets["Current EPC Band_method1"].isin(["C", "B", "F", "G"]) &
+ pd.isnull(region_assets["Confidence Tier"]),
+ "5 - EPC C or above", region_assets["Confidence Tier"]
+ )
+
+ region_assets = fill_survey_columns(region_assets, suffix="_method1")
+
+ method_1_columns = [c for c in region_assets.columns if c.endswith("_method1")]
+ region_assets = region_assets.drop(columns=method_1_columns + ["Match Type"])
+
+ missed_addressids = region_assets[pd.isnull(region_assets["Confidence Tier"])]["Address ID"].unique().tolist()
+
+ if not missed_addressids:
+ results.append(region_assets)
+ continue
+
+ # This means that this archetype was never surveyed and so we need to find a sufficiently similar property
+ final_missed_matches = []
+ for a_id in missed_addressids:
+
+ match_type = "3 - compared to similar properties"
+
+ property = asset_list[asset_list["Address ID"] == a_id].squeeze()
+
+ surveyed = match_property_to_surveyed(property, survey_results_with_original_features)
+
+ if surveyed.empty:
+ match_type = "3 - compared to similar properties, relaxed"
+ # In this case, we do one additional check where we filter on everything the same apart from heating,
+ # where we do a slightly more rough match
+ surveyed = survey_results_with_original_features[
+ (
+ survey_results_with_original_features["Property Type"].str.split(":").str[0] ==
+ property["Property Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+ property["Wall Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ property["Roof Type"].split(":")[0]
+ )
+ ].copy()
+
+ if surveyed.empty:
+ if property["Property Type"].split(":")[0] in ["House", "Bungalow", "Maisonette"]:
+ filter_property_types = ["House", "Bungalow", ]
+ else:
+ filter_property_types = ["Flat"]
+ surveyed = survey_results_with_original_features[
+ (
+ survey_results_with_original_features["Property Type"].str.split(":").str[0].isin(
+ filter_property_types
+ )
+ ) &
+ (
+ survey_results_with_original_features["Wall Type"].str.split(":").str[0] ==
+ property["Wall Type"].split(":")[0]
+ ) &
+ (
+ survey_results_with_original_features["Roof Type"].str.split(":").str[0] ==
+ property["Roof Type"].split(":")[0]
+ )
+ ].copy()
+
+ if "Electric" in property["Heating"]:
+ # Take other electric heating systems
+ surveyed = surveyed[surveyed["Heating"].str.contains("Electric")]
+ elif property["Heating"] in [
+ "Community Heating Systems: Community boilers only (RdSAP)",
+ "Community Heating Systems: Community CHP and boilers (RdSAP)"
+ ]:
+ # Take other community heating systems
+ surveyed = surveyed[surveyed["Heating"].str.contains("Community")]
+ elif property["Heating"] == 'Heat Pump: (from database)':
+ # Take other heat pumps
+ surveyed = surveyed[surveyed["Heating"].str.contains("Heat Pump")]
+ elif property["Heating"] == "Solid fuel room heaters: Open fire in grate":
+ # Take other properties with room heaters
+ surveyed = surveyed[surveyed["Heating"].str.contains("room heaters")]
+ elif "Boiler" in property["Heating"]:
+ # Take other properties with boilers
+ surveyed = surveyed[surveyed["Heating"].str.contains("Boiler")]
+ else:
+ raise Exception("Fix me")
+
+ if surveyed.empty:
+ final_missed_matches.append(
+ {
+ "Address ID": a_id,
+ "Confidence Tier": "4 - no similar property, needs survey to confirm",
+ "Current EPC Band": "Needs Survey",
+ "Current SAP Rating": "Needs Survey",
+ 'Survey: Main Wall Type': "Not Surveyed",
+ "Survey: Main Alternative Wall": "Not Surveyed",
+ "Survey: Main Roof Type": "Not Surveyed",
+ "Survey: Primary Heating System": "Not Surveyed",
+ "Survey: Matching Address ID": "Not Surveyed",
+ 'Distance to Closest Match (m)': 9999999,
+ "Package Ref": "Not Surveyed",
+ }
+ )
+ continue
+
+ # Calculate distance
+ surveyed["distance_meters"] = haversine(
+ lat1=property["latitude"], lon1=property["longitude"],
+ lat2=surveyed["latitude"].values, lon2=surveyed["longitude"].values
+ )
+ surveyed = surveyed.sort_values("distance_meters", ascending=True)
+
+ # Check if we have a postcode match, i.e. whether any surveyed postcode is the same as the property postcode
+ if any(surveyed["Postcode"] == property["Postcode"]):
+ surveyed = surveyed[surveyed["Postcode"] == property["Postcode"]]
+
+ if any(surveyed["Postal Region"] == property["Postal Region"]):
+ surveyed = surveyed[surveyed["Postal Region"] == property["Postal Region"]]
+
+ # Take the 3 nearest
+ surveyed = surveyed.head(3)
+
+ # perform a weighted mean of SAP rating - the closer the better
+ expected_sap = np.average(
+ surveyed["Current SAP Rating"], weights=1 / (surveyed["distance_meters"] + 1)
+ )
+ expected_epc = sap_to_epc(expected_sap)
+
+ if expected_epc in ["C", "B", "A"]:
+ match_type = "5 - EPC C or above"
+
+ closest_match = surveyed.iloc[0]
+
+ # The closest property may be an EPC C, so we take the package ref from the nearest property
+ # with a non-NA package ref
+ if expected_epc in ["C", "B", "A"]:
+ package_ref = None
+ else:
+ package_ref = surveyed["Package Ref"].dropna().values[0]
+
+ final_missed_matches.append(
+ {
+ "Address ID": a_id,
+ "Confidence Tier": match_type,
+ "Current EPC Band": expected_epc,
+ "Current SAP Rating": expected_sap,
+ 'Survey: Main Wall Type': closest_match["Survey: Main Wall Type"],
+ "Survey: Main Alternative Wall": closest_match["Survey: Main Alternative Wall"],
+ "Survey: Main Roof Type": closest_match["Survey: Main Roof Type"],
+ "Survey: Primary Heating System": closest_match["Survey: Primary Heating System"],
+ "Survey: Matching Address ID": closest_match["Address ID"],
+ 'Distance to Closest Match (m)': closest_match["distance_meters"],
+ "Package Ref": package_ref
+ }
+ )
+ continue
+
+ final_missed_matches = pd.DataFrame(final_missed_matches)
+
+ region_assets = region_assets.merge(
+ final_missed_matches,
+ on="Address ID",
+ how="left",
+ suffixes=("", "_method3")
+ )
+
+ region_assets["Confidence Tier"] = region_assets["Confidence Tier"].fillna(
+ region_assets["Confidence Tier_method3"]
+ )
+
+ region_assets = fill_survey_columns(region_assets, suffix="_method3")
+
+ method_3_columns = [c for c in region_assets.columns if c.endswith("_method3")]
+ region_assets = region_assets.drop(columns=method_3_columns)
+
+ if pd.isnull(region_assets["Current EPC Band"]).sum():
+ raise Exception("Something went wrong")
+
+ results.append(region_assets)
+
+ results = pd.concat(results)
+
+ if (pd.isnull(results["Package Ref"]) & (~results["Current EPC Band"].isin(["A", "B", "C"]))).sum():
+ raise ValueError("Missing Package Refs")
+
+ # Check if there are missings in current epc band, current sap rating or any of the survey attributes
+ for c in (
+ [
+ "Current EPC Band", "Current SAP Rating", "Survey: Matching Address ID", 'Distance to Closest Match (m)'] +
+ survey_attribute_columns
+ ):
+ if pd.isnull(results[c]).sum():
+ raise Exception("Something went wrong")
+
+ gain_columns = sorted([x for x in results["Confidence Tier"].unique() if "1 - " in x or "2 - " in x or "3 - " in x])
+ loss_columns = sorted([x for x in results["Confidence Tier"].unique() if "4 - " in x or "5 - " in x])
+
def optimise(gain, loss, max_loss=250):
    """
    Select the rows that maximise total gain while keeping total loss within ``max_loss``.

    Each row is either selected (1) or not (0), so this is a 0/1 knapsack problem. It is solved as an
    integer programme with the HiGHS solver. Solving the continuous relaxation and rounding afterwards
    can round a fractional row up and break the loss limit, and reports the relaxed (not the achieved)
    gain.

    :param gain: 1-D array-like with the gain of each row.
    :param loss: 1-D array-like with the loss of each row; same length as ``gain``.
    :param max_loss: Maximum total loss allowed across the selected rows.
    :return: Tuple ``(selected_rows, optimal_gain)``: an integer array of 0/1 flags per row and the
        total gain of the selected rows.
    :raises ValueError: If ``gain`` and ``loss`` have different shapes.
    :raises Exception: If the solver does not find a solution.
    """
    gain = np.asarray(gain, dtype=float)
    loss = np.asarray(loss, dtype=float)
    if gain.shape != loss.shape:
        raise ValueError("gain and loss must have the same shape")

    # Nothing to choose from
    if gain.size == 0:
        return np.zeros(0, dtype=int), 0.0

    result = linprog(
        -gain,  # linprog minimises, so negate to maximise gain
        A_ub=loss.reshape(1, -1),  # Only 1 constraint for now, total Loss
        b_ub=[max_loss],
        bounds=[(0, 1)] * gain.size,
        integrality=np.ones(gain.size, dtype=int),  # every variable must be 0 or 1
        method='highs'
    )
    if not result.success:
        raise Exception("Optimization failed")

    # The solver returns floats (e.g. 0.9999999), so snap them to exact 0/1 flags
    selected_rows = result.x.round().astype(int)
    optimal_gain = float(gain @ selected_rows)

    return selected_rows, optimal_gain
+
+ street_summary = results.pivot_table(
+ index='Street and Region',
+ columns='Confidence Tier',
+ aggfunc='size',
+ fill_value=0
+ ).reset_index()
+
+ street_summary["Gain"] = street_summary[gain_columns].sum(axis=1)
+ street_summary["Loss"] = street_summary[loss_columns].sum(axis=1)
+
+ selected_rows, _ = optimise(
+ gain=street_summary["Gain"].values,
+ loss=street_summary["Loss"].values,
+ max_loss=250
+ )
+
+ street_summary["Selected"] = selected_rows == 1
+ print(street_summary[street_summary["Selected"]][["Gain", "Loss"]].sum())
+
+ selected_streets = street_summary[
+ street_summary["Selected"]
+ ]
+
+ totals = selected_streets[["Gain", "Loss"]].sum()
+
+ bid_size = totals.sum()
+ print("Bid Size:", bid_size)
+ total_epc_d_or_below = totals["Gain"]
+ print("Total EPC D or below:", total_epc_d_or_below)
+ total_epc_c = totals["Loss"]
+ print("Total EPC C or above:", total_epc_c)
+ # Total needing a survey
+ total_needing_survey = selected_streets[
+ "4 - no similar property, needs survey to confirm"
+ ].sum()
+ print("Total needing survey:", total_needing_survey)
+
+ # Label final outputs
+ # We create a summary of packages by street
+ results["Package Ref"] = results["Package Ref"].fillna("EPC C - No Package")
+ results["Package Ref"] = results["Package Ref"].astype(str)
+ results["Package Ref"] = np.where(
+ results["Package Ref"] == "4.0", "4", results["Package Ref"]
+ )
+ package_summary = results.pivot_table(
+ index='Street and Region',
+ columns='Package Ref',
+ aggfunc='size',
+ fill_value=0
+ ).reset_index()
+
+ assert sum([v for k, v in package_summary.sum().items() if k != "Street and Region"]) == results.shape[0]
+
+ street_bid_structure = street_summary.merge(
+ package_summary, how="left", on="Street and Region"
+ )
+ street_bid_structure = street_bid_structure.sort_values("Gain", ascending=False)
+
+ individual_units_programme = results.copy()
+ individual_units_programme["Unit in Programme"] = individual_units_programme["Street and Region"].isin(
+ street_bid_structure[street_bid_structure["Selected"]]["Street and Region"].values
+ )
+
+ # Merge on Stonewaters ID
+ asset_list_ids = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+ "- Archetyped V3.1.xlsx",
+ header=4
+ )[["Address ID", "Org. ref."]]
+ # Clean address ids
+ asset_list_ids = asset_list_ids[~pd.isnull(asset_list_ids["Address ID"])]
+ asset_list_ids = asset_list_ids[asset_list_ids["Address ID"] != "Address ID"]
+ asset_list_ids["Address ID"] = asset_list_ids["Address ID"].astype(int)
+
+ individual_units_programme = individual_units_programme.merge(
+ asset_list_ids.rename(
+ columns={"Org. ref.": "Survey: Org. ref.", "Address ID": "Survey: Matching Address ID"}
+ ),
+ how="left",
+ on="Survey: Matching Address ID"
+ )
+
+ individual_units_programme["Survey: Org. ref."] = np.where(
+ (individual_units_programme["Survey: Matching Address ID"] == "Not Surveyed"),
+ "Not Surveyed",
+ individual_units_programme["Survey: Org. ref."]
+ )
+
+ if pd.isnull(individual_units_programme["Survey: Org. ref."]).sum() or pd.isnull(
+ individual_units_programme["Org. ref."]).sum():
+ raise ValueError("something went wrong")
+
+ for col in ["Survey: Main Roof Type", "Survey: Main Wall Type", "Survey: Main Alternative Wall"]:
+ individual_units_programme[col] = (
+ individual_units_programme[col]
+ .str.replace(r': nan(?=$|:)', '', regex=True) # Remove ': nan' at the end or before another ':'
+ .str.replace(r':\s+:', ': ', regex=True) # Replace occurrences of ': :' with ': '
+ .str.replace(r'\s+', ' ', regex=True) # Replace multiple spaces with a single space
+ .str.strip() # Strip leading/trailing spaces
+ )
+
+ # Any EPC C properties that have been included should be flagged as potential low carbon heating
+ selected_epc_c = individual_units_programme[
+ (individual_units_programme["Current EPC Band"].isin(["C", "B", "A", "Needs Survey"])) &
+ (individual_units_programme["Unit in Programme"])
+ ]
+
+ flat_wall_map = {
+ "CA Cavity: F Filled Cavity": False,
+ "CA Cavity: A As Built": True,
+ "SO Solid Brick: A As Built": True,
+ "Not Surveyed": False
+ }
+
+ heating_map = {
+ "BGW Post 98 Combi condens. with auto ign.": False,
+ "BGB Post 98 Regular condens. with auto ign.": False,
+ "SEK High heat retention storage heaters": False,
+ "SEB Modern slimline storage heaters": True,
+ "Not Surveyed": False
+ }
+
+ infill_data = []
+ for _, epc_c_property in selected_epc_c.iterrows():
+ if epc_c_property["Property Type"].split(":")[0] == "Flat":
+ # Look for a wall insulation measure
+ infill = flat_wall_map[epc_c_property["Survey: Main Wall Type"]]
+ infill_data.append(
+ {
+ "Address ID": epc_c_property["Address ID"],
+ "Street and Region": epc_c_property["Street and Region"],
+ "Possible Flat Infill?": infill
+ }
+ )
+ continue
+
+ infill = heating_map[epc_c_property["Survey: Primary Heating System"]]
+ infill_data.append(
+ {
+ "Address ID": epc_c_property["Address ID"],
+ "Street and Region": epc_c_property["Street and Region"],
+ "Low Carbon Heating Infill?": infill
+ }
+ )
+ infill_data = pd.DataFrame(infill_data)
+
+ individual_units_programme = individual_units_programme.merge(
+ infill_data[["Address ID", 'Possible Flat Infill?', 'Low Carbon Heating Infill?']],
+ how="left", on="Address ID"
+ )
+
+ for c in ['Possible Flat Infill?', 'Low Carbon Heating Infill?']:
+ individual_units_programme[c] = individual_units_programme[c].fillna(False)
+
+ infill_by_street = infill_data.pivot_table(
+ index='Street and Region',
+ values=['Possible Flat Infill?', 'Low Carbon Heating Infill?'],
+ aggfunc='sum',
+ fill_value=0
+ ).reset_index()
+
+ street_bid_structure = street_bid_structure.merge(
+ infill_by_street, how="left", on="Street and Region"
+ )
+
+ for c in ['Low Carbon Heating Infill?', 'Possible Flat Infill?']:
+ street_bid_structure[c] = street_bid_structure[c].fillna(0)
+
+ master_sheet = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+ "master "
+ "sheet.csv",
+ encoding='latin1'
+ )
+ master_sheet = master_sheet[["Address ID", "Main Fuel"]]
+
+ individual_units_programme = individual_units_programme.merge(
+ master_sheet, how="left", on="Address ID"
+ )
+
+ street_bid_structure.to_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Street Bid Structure V2.csv"), index=False
+ )
+
+ individual_units_programme.to_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False
+ )
+
+ survey_results = pd.read_excel(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"),
+ header=13,
+ sheet_name="Modelled Packages"
+ )
+
+ indivual_units = pd.read_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv")
+ )
+
+ u_aids = survey_results["Archetype ID"].astype(str).unique()
+ units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values
+
+ len({v for v in units_in_bid if str(v) in u_aids})
+ len(list(set(units_in_bid)))
+
+
def identify_incorrect_packages():
    """
    Flag units whose assigned package looks unsuitable, so they can be raised for Stonewater's review.

    Due to limitations in the data collected during survey, some properties do not have suitable
    packages assigned. This function:

    1. reads the "Individual Units Programme" sheet and the stockbook heating export;
    2. flags units where the stockbook heating type disagrees with the Parity heating / main fuel;
    3. flags solar PV packages assigned to units with another dwelling above;
    4. summarises, per street / region / postcode, how mixed the wall, heating and fuel values are;
    5. appends the newest EPC per UPRN and the closest Wave 2.1 package within the same postcode;
    6. writes one "<field> - aggregated results.csv" per checked field, plus
       "Units with assigned packages - with flags.csv", into CUSTOMER_FOLDER_PATH.

    :return: None. All results are written to disk.
    """
    import json

    from utils.s3 import read_from_s3, read_pickle_from_s3

    def check_mixed_types(row):
        """Return True when the non-zero counts in ``row`` span more than one primary type."""
        # The primary type is the part of the value before the first colon
        primary_types_present = {
            value.split(':')[0]
            for value, count in row.items()
            if isinstance(value, str) and ':' in value and count > 0
        }
        return len(primary_types_present) > 1

    def read_epc_data():
        """Read both batches of stored EPC data from S3 and stack them into one DataFrame."""
        epc_data = pd.DataFrame(
            json.loads(
                read_from_s3(
                    bucket_name="retrofit-data-dev",
                    s3_file_name="customers/Stonewater/clustering/epc_data.json"
                )
            )
        )

        # Manual correction: the record with internal_id 1091 is given UPRN 83143766
        epc_data["uprn"] = np.where(
            epc_data["internal_id"] == 1091,
            83143766,
            epc_data["uprn"]
        )

        # NOTE: this un-pickles data from our own bucket; never point it at untrusted data
        epc_data_batch_2 = pd.DataFrame(
            read_pickle_from_s3(
                s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
                bucket_name="retrofit-data-dev"
            )
        )

        return pd.concat([epc_data, epc_data_batch_2])

    units_with_assigned_packages = pd.read_excel(
        os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.20 V2.xlsx"),
        header=2,
        sheet_name="Individual Units Programme"
    )

    # This sheet contains information on the heating systems for properties, so we can flag any units that have
    # been labelled as being electric but are actually gas
    heating_survey_data = pd.read_excel(
        os.path.join(CUSTOMER_FOLDER_PATH, "STOCKBOOK December 2024 data (5).xlsx"),
        header=0,
        sheet_name="Export"
    )

    units_with_assigned_packages = units_with_assigned_packages.merge(
        heating_survey_data[["Asset Reference", "Heating Type"]], how="left",
        left_on="Org. ref.", right_on="Asset Reference"
    )

    # ------------------------------------------------------------------------------------------------
    # 1) Heating type checks: stockbook heating type vs the Parity heating system / main fuel
    # ------------------------------------------------------------------------------------------------
    stockbook_heating = units_with_assigned_packages["Heating Type"]

    # (flag column, rows to check, Parity column to inspect, Parity values that contradict the stockbook)
    heating_checks = [
        (
            "Gas properties: different to Parity",
            stockbook_heating.isin(["Gas", "Communal Gas"]),
            "Heating",
            [
                "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C",
                "Electric Storage Systems: Fan storage heaters",
                "Electric (direct acting) room heaters: Panel, convector or radiant heaters"
            ]
        ),
        (
            "Electric properties: different to Parity",
            stockbook_heating == "Electric",
            "Heating",
            [
                "Boiler: A rated Regular Boiler",
                "Boiler: F rated Combi",
                "No Heating",
                "Boiler: A rated CPSU",
                "Boiler: G rated Regular Boiler"
            ]
        ),
        (
            "Ground Source properties: different to Parity",
            stockbook_heating == "Ground Source",
            "Heating",
            [
                "Heat Pump: Electric Heat pumps: Air source heat pump with flow temperature <= 35°C",
                "Electric Storage Systems: Fan storage heaters",
                "Electric Storage Systems: High heat retention storage heaters"
            ]
        ),
        (
            "LPG properties: different to Parity",
            stockbook_heating == "Lpg",
            "Main Fuel",
            ["Gas: Mains Gas", "Solid Fuel: Wood Logs, Gas: Mains Gas"]
        ),
        (
            "Solid Fuel properties: different to Parity",
            stockbook_heating == "Solid Fuel",
            "Main Fuel",
            ["Gas: Mains Gas"]
        ),
    ]
    for flag_column, rows_to_check, parity_column, contradicting_values in heating_checks:
        units_with_assigned_packages[flag_column] = (
            rows_to_check & units_with_assigned_packages[parity_column].isin(contradicting_values)
        )

    # ------------------------------------------------------------------------------------------------
    # 2) Feature checks: properties with features that are not conducive to specific packages, e.g.
    #    Solar PV packages for properties that have another dwelling above
    # ------------------------------------------------------------------------------------------------
    has_solar_package = units_with_assigned_packages["Package Ref"].isin(["3A", "3B", "4", 4])

    # Label properties that have been matched to a package, during coordination, that includes Solar PV and has
    # a property with a dwelling above
    units_with_assigned_packages["Invalid Roof Type for Solar - coordination to be reviewed"] = (
        has_solar_package &
        units_with_assigned_packages["Survey: Main Roof Type"].str.contains("A Another dwelling above", na=False)
    )

    # Label properties that have a dwelling above in the Parity data, and weren't surveyed, but have been assigned
    # a package that includes solar PV. This previously repeated the survey-based check above and overwrote it.
    # NOTE(review): assumes the Parity "Roof Type" text contains "another dwelling above" - confirm wording
    was_surveyed = units_with_assigned_packages["Confidence Tier"].astype(str).str.contains(
        "property was surveyed"
    )
    units_with_assigned_packages["Invalid Roof Type for Solar - Parity data to be reviewed"] = (
        has_solar_package & ~was_surveyed &
        units_with_assigned_packages["Roof Type"].str.contains("another dwelling above", case=False, na=False)
    )

    # ------------------------------------------------------------------------------------------------
    # 3) Iterate through postcodes and find anomalous properties based on the Parity data and survey data
    # ------------------------------------------------------------------------------------------------
    fields_to_check = [
        'Wall Type Category',
        # 'Roof Type Category', - not very interesting
        'Heating',
        'Main Fuel',
        'Survey: Main Wall Type',
        # 'Survey: Main Roof Type',
        'Survey: Primary Heating System'
    ]

    # Wall type category is the wall type with any bracketed detail removed
    units_with_assigned_packages['Wall Type Category'] = units_with_assigned_packages['Wall Type'].str.replace(
        r'\s*\(.*?\)', '', regex=True
    )

    # Create roof type category by splitting on the colon and taking the first part
    units_with_assigned_packages['Roof Type Category'] = units_with_assigned_packages['Roof Type'].str.split(':').str[0]

    units_with_assigned_packages["Street, Region and Postcode"] = (
        units_with_assigned_packages["Street and Region"] + ", " + units_with_assigned_packages["Postcode"]
    )

    aggregated_results = {}
    for field in fields_to_check:
        # Group by postcode and count occurrences of each unique value
        field_counts = (
            units_with_assigned_packages.groupby(['Street, Region and Postcode', field])
            .size()
            .unstack(fill_value=0)
            .reset_index()
        )

        # Take the pure counts before any summary column is added, so the summaries never count themselves
        counts = field_counts[list(field_counts.columns[1:])]
        number_of_properties = counts.sum(axis=1)

        field_counts['Dominant Value'] = counts.idxmax(axis=1)
        field_counts['% Dominant'] = (counts.max(axis=1) / number_of_properties) * 100
        field_counts['Number of Properties'] = number_of_properties
        field_counts['Mixed Type'] = counts.apply(check_mixed_types, axis=1)

        aggregated_results[field] = field_counts

    # ------------------------------------------------------------------------------------------------
    # 4) Append the newest EPC per UPRN
    # ------------------------------------------------------------------------------------------------
    epc_data = read_epc_data()
    # Get just the fields we want from the EPC: Uprn, Wall, Roof, Heating, Fuel, SAP Score, EPC Band, Date of EPC
    epc_data_to_append = epc_data[
        [
            "uprn", "walls-description", "roof-description", "mainheat-description", "main-fuel",
            "current-energy-efficiency", "current-energy-rating", "lodgement-date",
            "estimated"
        ]
    ].rename(
        columns={
            "uprn": "UPRN",
            "walls-description": "EPC: Wall Type",
            "roof-description": "EPC: Roof Type",
            "mainheat-description": "EPC: Heating",
            # The key must match the selected "main-fuel" column, otherwise it is never renamed
            "main-fuel": "EPC: Main Fuel",
            "current-energy-efficiency": "EPC: SAP Score",
            "current-energy-rating": "EPC: EPC Band",
            "lodgement-date": "EPC: Date of EPC",
            "estimated": "EPC Estimated based on Nearby Properties"
        }
    )

    # Normalise the UPRN first (blank / non-numeric UPRNs are dropped), so that the same UPRN stored as text in
    # one batch and as a number in the other is still recognised as a duplicate below
    epc_data_to_append["UPRN"] = pd.to_numeric(epc_data_to_append["UPRN"], errors="coerce")
    epc_data_to_append = epc_data_to_append.dropna(subset=["UPRN"])
    epc_data_to_append["UPRN"] = epc_data_to_append["UPRN"].astype("int64")

    # Parse the dates before sorting so we sort on real dates rather than on text
    epc_data_to_append["EPC: Date of EPC"] = pd.to_datetime(epc_data_to_append["EPC: Date of EPC"])

    # Take the newest EPC per UPRN, based on lodgement date
    epc_data_to_append = epc_data_to_append.sort_values("EPC: Date of EPC", ascending=False).drop_duplicates("UPRN")

    # Years since the EPC was lodged
    epc_data_to_append["Years since EPC"] = (pd.Timestamp.now() - epc_data_to_append["EPC: Date of EPC"]).dt.days / 365

    units_with_assigned_packages = units_with_assigned_packages.merge(
        epc_data_to_append, how="left", on="UPRN",
    )

    # ------------------------------------------------------------------------------------------------
    # 5) Compare against the packages approved in wave 2.1
    # ------------------------------------------------------------------------------------------------
    wave_2_data = pd.read_excel(
        os.path.join(
            CUSTOMER_FOLDER_PATH, "Stonewater 2.1 SAP Pre & Post.xlsx"
        ),
        header=3
    )
    # Remove any where the work is outstanding
    wave_2_data = wave_2_data[wave_2_data["Retrofit Assessment"] == "Completed"]
    wave_2_data = wave_2_data[~pd.isnull(wave_2_data["Package Approved (Client)"])].copy()
    wave_2_data["house_number"] = wave_2_data["Name"].apply(lambda x: SearchEpc.get_house_number(x, ""))

    # Filter postcodes in the units_with_assigned_packages, to find overlapping postcodes. Units matched with the
    # highest confidence are left alone
    related_to_wave_2 = units_with_assigned_packages[
        units_with_assigned_packages["Postcode"].isin(
            wave_2_data["Post Code"].values
        ) & (
            ~units_with_assigned_packages["Confidence Tier"].isin(
                [
                    "1 - same archetype, same postal region", "1 - property was surveyed"
                ]
            )
        )
    ]

    # Keyed by row label, so the matches can be joined back on without relying on UPRN being populated
    wave2_matches = {}
    for row_label, home in related_to_wave_2.iterrows():
        # Get the related homes
        candidates = wave_2_data[wave_2_data["Post Code"] == home["Postcode"]]

        if candidates.shape[0] != 1:
            # In this case, we get the closest match based on door number
            door_number = pd.to_numeric(
                SearchEpc.get_house_number(home["Name"], home["Postcode"]), errors="coerce"
            )
            door_gap = (pd.to_numeric(candidates["house_number"], errors="coerce") - door_number).abs()
            if door_gap.isna().all():
                # No usable door numbers to compare: record no match rather than crash or guess
                continue
            candidates = candidates[door_gap == door_gap.min()]

        closest = candidates.iloc[0]
        wave2_matches[row_label] = {
            "2.1 matched address": closest["Name"],
            "2.1 matched address: Package Ref": closest["Package Approved (Client)"],
            "2.1 matched address: Wall Insulation": closest["Wall Insulation"],
            "2.1 matched address: Loft Insulation": closest["Loft Insulation"],
            "2.1 matched address: Ventilation": closest["Ventilation"],
            "2.1 matched address: Windows": closest["Windwos Upgrade"]
        }

    # Attach the wave 2.1 matches to the units; previously they were computed and then discarded
    if wave2_matches:
        units_with_assigned_packages = units_with_assigned_packages.join(
            pd.DataFrame.from_dict(wave2_matches, orient="index")
        )

    # ------------------------------------------------------------------------------------------------
    # 6) Store the results
    # ------------------------------------------------------------------------------------------------
    for field, df in aggregated_results.items():
        df.to_csv(
            os.path.join(CUSTOMER_FOLDER_PATH, f"{field} - aggregated results.csv"), index=False
        )

    units_with_assigned_packages.to_csv(
        os.path.join(CUSTOMER_FOLDER_PATH, "Units with assigned packages - with flags.csv"), index=False
    )
+
+
def extract_sharepoint_url(x):
    """
    Extract the last two path segments (folder/file) from a "<label> - http..." SharePoint link.

    :param x: Text of the form "<label> - http(s)://host/path", or a null value.
    :return: "<second to last segment>/<last segment>" of the URL path with percent-escapes decoded,
        or "" when ``x`` is null or contains no " - http" link.
    """
    if pd.isnull(x):
        return ""

    pieces = x.split(" - http")
    if len(pieces) < 2:
        # No link in the text: treat it like a missing value instead of raising IndexError
        return ""

    # Put back the "http" that the split consumed so that we parse a well-formed URL
    path = parse.urlparse("http" + pieces[1]).path

    # Decode each segment after splitting, so an escaped "/" (%2F) cannot change the segmentation
    return "/".join(parse.unquote(segment) for segment in path.split("/")[-2:])
+
+
+def revised_model():
+ """
+ This function implements the revised model for Stonewater, where we are looking at new priority postcodes
+ This work was undertaken in January 2025.
+ """
+
+ # 1) Create the new list of properties
+ new_priority_postcodes = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 "
+ "priority list.xlsx"
+ )
+
+ original_archetypes = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
+ "- Archetyped V3.1.xlsx",
+ header=4
+ )
+ original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
+ original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
+ original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+ original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)
+
+ wave_21_folder_name = "Wave 2.1 Surveys - 2"
+
+ # Check if we have all of the addresses
+ missed = original_archetypes[
+ ~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
+ ]["Archetype ID"].unique()
+
+ assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}
+
+ original_archetypes = original_archetypes[
+ ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"]
+ ]
+
+ # Merge these archetypes on to the new priority postcodes
+ new_priority_postcodes = new_priority_postcodes.merge(
+ original_archetypes, how="left", on="Address ID"
+ )
+
+ # Basic check, should have no rows with missing Archetype ID, where
+ assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin(
+ original_archetypes["Address ID"]
+ ).sum()) == 0
+
+ # We pull together the survey data sheet
+ survey_folders = []
+
+ # Loop over each survey folder and list its contents
+ for i in range(1, NUM_FOLDERS + 1):
+ folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+ if os.path.isdir(folder_path): # Check if folder exists
+ folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+ survey_folders.extend(folder_contents) # Append contents to the master list
+
+ wave_21_folders = [
+ "1. Herefordshire",
+ "2. Bedfordshire",
+ "3. Wiltshire",
+ "4. Bournemouth",
+ "5. Coventry",
+ "6. West Sussex",
+ "7. Dorset",
+ "8. Cambridgeshire",
+ "9. Guildford",
+ "10. Little Island",
+ "11. CCS Dorset"
+ ]
+
+ for wave_2_1_folder in wave_21_folders:
+ folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
+ if os.path.isdir(folder_path): # Check if folder exists
+ folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in
+ os.listdir(folder_path)]
+ survey_folders.extend(folder_contents) # Append contents to the master list
+
+ # We now do a large pull of all of the data
+ extracted_data = []
+ mtp_extracted_data = [] # Additional data to extract from the medium term plans
+ for survey_folder in tqdm(survey_folders):
+ survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
+
+ # Check that the survey folder is actually a folder
+ if not os.path.isdir(survey_folder_path):
+ continue
+
+ # List the folders inside of the survey folder
+ survey_subfolders = [
+ name for name in os.listdir(survey_folder_path)
+ if os.path.isdir(os.path.join(survey_folder_path, name))
+ ]
+
+ # Check if there's a "retrofit assessment" folder
+ retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+
+ ra_folder = next(
+ (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+ None
+ )
+
+ mtp_folder = next(
+ (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()),
+ None
+ )
+ if mtp_folder:
+ # We have a mid term plan:
+ mtp_folder_path = os.path.join(survey_folder_path, mtp_folder)
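+ # Get the contents of the mid term plan folder, skipping .DS_Store.
+ # NOTE(review): the isdir() check below joins mtp_folder onto mtp_folder_path a second time, so sub-folders are
+ # not actually filtered out here - the "v1" step that follows lists one of these entries as a directory.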
+ mtp_contents = [
+ os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path)
+ if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file))
+ ]
+
+ has_v1 = [
+ f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower()
+ ]
+
+ if has_v1:
+ # Then we go one level deeper
+ mtp_contents = [
+ os.path.join(has_v1[0], f) for f in
+ os.listdir(os.path.join(survey_folder_path, has_v1[0]))
+ ]
+
+ # We check each file to see whether it is the IMA
+ for file_name in mtp_contents:
+
+ filepath = os.path.join(survey_folder_path, file_name)
+ # We expect a pdf so try and parse it
+ try:
+ with open(filepath, "rb") as file:
+ reader = PyPDF2.PdfReader(file)
+ # Just the first page
+ text = reader.pages[0].extract_text()
+
+ except Exception as e:
+ continue
+
+ # We check if this is an IMA
+ ima_heading_search = re.search(
+ r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text
+ )
+
+ is_ima = bool(ima_heading_search)
+ if not is_ima:
+ continue
+
+ # Otherwise, extract: RIR, PV
+ pv_search = re.search(r"PV \(\d+Kwp\)", text)
+ has_pv = bool(pv_search)
+ pv_system = pv_search.group(0) if has_pv else None
+
+ # We perform a second search for PV:
+ if pv_search is None:
+ pv_search = re.search("solar pv", text.lower())
+ has_pv = bool(pv_search)
+ pv_system = "Solar PV" if has_pv else None
+
+ rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text)
+ has_rir = bool(rir_search)
+ rir_spec = rir_search.group(0) if has_rir else None
+
+ mtp_extracted_data.append({
+ "survey_folder": survey_folder,
+ "has_pv": has_pv,
+ "PV System": pv_system,
+ "RIR Specification": rir_spec,
+ "has_rir": has_rir
+ })
+ continue
+
+ # If retrofit assessment folder exists, check if it has content
+ if retrofit_folder or ra_folder:
+ if retrofit_folder:
+ retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+ else:
+ retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+ # Check if everything inside is a sub-folder and the number of folders is 2
+ items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+ all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+ if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+ # Get the folder that isn't Property Pics
+ retrofit_folder_path = os.path.join(
+ retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+ )
+
+ if os.listdir(retrofit_folder_path): # If not empty
+ summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+ continue
+ else:
+ # Then we have an empty Retrofit Assessment folder
+ continue
+
+ # If no retrofit folder or it was empty, check files in survey_folder
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+ if not summary_data:
+ if len(survey_subfolders) == 1:
+ survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+
+ retrofit_assessment_data = pd.DataFrame(extracted_data)
+ mtp_df = pd.DataFrame(mtp_extracted_data)
+
+ # Save
+ # retrofit_assessment_data.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False
+ # )
+ # mtp_df.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False
+ # )
+ retrofit_assessment_data = pd.read_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"),
+ )
+ mtp_df = pd.read_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"),
+ )
+
+ # There are a few duplicates we just manually drop
+ mtp_df = mtp_df.drop_duplicates()
+ mtp_df = mtp_df[
+ ~((
+ mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27"
+ ) & (~mtp_df["has_pv"]))
+ ]
+
+ mtp_df = mtp_df[
+ ~((
+ mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5"
+ ) & (~mtp_df["has_pv"]))
+ ]
+
+ # Remove some definite duplicates
+ dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"]
+ dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)]
+ dupes = dupes.sort_values("Address")
+ # Get all of the folders that end with ROSS
+ to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
+
+ # Replace \n with ""
+ retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
+
+ retrofit_assessment_data = retrofit_assessment_data[
+ ~retrofit_assessment_data["survey_folder"].isin(
+ [
+ "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+ "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
+ "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
+ ] + to_drop
+ )
+ ]
+
+ retrofit_assessments_data_columns = [
+ 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)',
+ 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys',
+ 'Fuel Bill', 'Window Age Description',
+ 'Window Age Description Proportion (%)',
+ 'Secondary Window Age Description',
+ 'Secondary Window Age Description Proportion (%)', 'Number of Windows',
+ 'Total Number of Doors', 'Number of Insulated Doors',
+ 'Existing Primary Heating System',
+ 'Existing Primary Heating PCDF Reference',
+ 'Existing Primary Heating Controls',
+ 'Existing Primary Heating % of Heat',
+ 'Existing Secondary Heating System',
+ 'Existing Secondary Heating PCDF Reference',
+ 'Existing Secondary Heating Controls',
+ 'Existing Secondary Heating % of Heat', 'Secondary Heating Code',
+ 'Water Heating Code', 'Total Floor Area (m2)',
+ 'Total Ground Floor Area (m2)', 'RIR Floor Area',
+ 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)',
+ 'Number of Light Fittings', 'Number of LEL Fittings',
+ 'Number of fittings needing LEL', 'Main Roof Type',
+ 'Main Roof Insulation', 'Main Roof Insulation Thickness',
+ 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining',
+ 'Main Wall Thickness', 'Main Building Alternative Wall Type',
+ 'Main Building Alternative Wall Insulation',
+ 'Main Building Alternative Wall Dry-lining',
+ 'Main Building Alternative Wall Thickness',
+ 'Main Fuel',
+ 'Main Building Age Band',
+ ]
+ # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
+ retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
+ rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed))
+ retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict)
+ retrofit_assessment_data["Survey: Current EPC Band"] = (
+ retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x))
+ )
+
+ # We can read in the data as needed
+
+ # Next Step: Read in the coordinated measures and match to the extracted data
+ ############################################################
+ # CCS
+ #############################################################
+ ccs_coordination_sheet = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Jan 2025 Project",
+ "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx"
+ ),
+ header=4
+ )
+ ccs_postcodes = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"
+ ),
+ header=4
+ )
+ ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 'Asset ID.1']].merge(
+ ccs_coordination_sheet, how="left", on="Name"
+ )
+ ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])]
+ ccs_coordination_sheet["contractor"] = "CCS"
+ # We split ccs into two sections - the last 21 rows (removed from programme) and the first 87 rows
+ ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21)
+ ccs_coordination_sheet = ccs_coordination_sheet.head(87)
+ ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])
+
+ ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x))
+
+ ############################################################
+ # WATES
+ #############################################################
+ wates_coordination_sheet = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx"
+ ),
+ header=4
+ )
+ wates_postcodes = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx"
+ ),
+ header=4
+ )
+ wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])]
+ wates_coordination_sheet = wates_coordination_sheet.merge(
+ wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name"
+ )
+
+ wates_coordination_sheet["contractor"] = "Wates"
+ # Break into the different sites:
+ # Wiltshire
+ wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267)
+ wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :]
+ wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :]
+ wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :]
+ wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :]
+ wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :]
+ wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :]
+ wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :]
+
+ wates_coordination = pd.concat(
+ [
+ wates_coordination_sheet_wiltshere,
+ wates_coordination_sheet_herefordshire,
+ wates_coordination_sheet_coventry,
+ wates_coordination_sheet_bedfordshire,
+ wates_coordination_sheet_bournemouth,
+ wates_coordination_sheet_cambridgeshire,
+ wates_coordination_sheet_removed_from_programme,
+ wates_coordination_sheet_abeyance
+ ]
+ )
+ # We correct the Asset ID for 34 Kempster Close
+ wates_coordination["Asset ID"] = np.where(
+ wates_coordination["Name"] == "34 Kempster Close",
+ "12005",
+ wates_coordination["Asset ID"]
+ )
+
+ # We fill the missing ids
+ missing_lookup = {
+ "4 Sydnall Fields": 31231,
+ "12 Sydnall Fields": 31239,
+ "12 Athena Gardens": 28061,
+ "49 Banner Lane": 41189,
+ "4 Jonathan Road": 41232,
+ "8 Jonathan Road": 41236,
+ "1 Jonathan Road": 41229,
+ "96 Taunton Way": 31417,
+ "94 Taunton Way": 31418,
+ "1 Lady Lane": 29430,
+ "10 Jonathan Road": 41283,
+ "21 Jonathan Road": 41246,
+ "12 Ashcroft Close": 26399
+ }
+ for name, asset_id in missing_lookup.items():
+ wates_coordination["Asset ID"] = np.where(
+ wates_coordination["Name"] == name,
+ asset_id,
+ wates_coordination["Asset ID"]
+ )
+
+ wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])]
+
+ wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
+ lambda x: extract_sharepoint_url(x)
+ )
+
+ ############################################################
+ # NEW 450 COORDINATED RETROFIT ASSESSMENTS
+ #############################################################
+ features = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+ "master sheet.csv",
+ encoding='latin1'
+ )
+ features["Address ID"] = features["Address ID"].astype(str).astype(int)
+ features_to_merge = features[["Address ID", "Organisation Reference"]]
+
+ retrofit_packages_board = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
+ ),
+ header=4
+ )
+ retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+ # Take just the rows that have been surveyed
+ retrofit_packages_board = retrofit_packages_board[
+ retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
+ ]
+
+ retrofit_packages_board = retrofit_packages_board.merge(
+ features_to_merge, how="left", on="Address ID"
+ )
+
+ manual_filters = {
+ "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
+ "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
+ "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
+ 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
+ '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
+ '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
+ 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
+ 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+ '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
+ '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
+ '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
+ '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
+ '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
+ '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
+ '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
+ '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
+ '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
+ "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
+ '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
+ '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
+ '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
+ '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
+ }
+
+ # We now match this retrofit packages board to the extracted data
+ matching_lookup = []
+ for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
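+ # We check that home["Name"] (with "Flat" removed) is contained in the survey_folder, ignoring case.
+ # NOTE(review): the r"[^\w\s]" pattern targets punctuation (not spaces), and home["Name"].replace(...) is a
+ # plain str.replace that treats it as literal text, so the name is left unchanged - confirm re.sub was intended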
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+ if to_filter.sum() == 0:
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
+ "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ # home["Name"] should be contained in the survey_folder
+ filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+ # We have an edge case where some properties have two outputs in Sharepoint
+ if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+ raise Exception("Fix me1")
+ # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+
+ if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+ raise Exception("Fix me2")
+ # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
+ if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+ filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
+
+ if filtered.empty:
+ continue
+ if filtered.shape[0] != 1:
+ raise Exception("something went wrong")
+
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+ matching_lookup = pd.DataFrame(matching_lookup)
+
+ ccs_coordination = ccs_coordination.rename(
+ columns={"Post Code": "Postcode"}
+ )
+ ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
+ ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
+
+ ccs_manual_filters = {
+ "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35"
+ }
+ ccs_matching_lookup = []
+ for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in ccs_manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
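+ # We check that home["Name"] (with "Flat" removed) is contained in the survey_folder, ignoring case.
+ # NOTE(review): the r"[^\w\s]" pattern targets punctuation (not spaces), and home["Name"].replace(...) is a
+ # plain str.replace that treats it as literal text, so the name is left unchanged - confirm re.sub was intended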
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["survey_folder"].
+ str.replace(r"[^\w\s]", "").
+ str.replace(",", "").
+ str.replace(".", "").
+ str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ # Do a fuzzy match on the name
+ # Find the best filter
+ to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply(
+ lambda x: fuzz.partial_ratio(home["Name"], x) > 93
+ )
+ if to_filter.sum() == 0:
+ # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+ # property name is actually 7 Colville Road, so we try taking the final part of the address,
+ # splitting on space, and adding it to the front
+ def reformat_survey_folder(x):
+     """Move the last space-separated word of the final path component of `x` to the front."""
+     # e.g. ".../Colville Road 7" -> "7 Colville Road"
+     *leading, trailing = x.rsplit("/", 1)[-1].split(" ")
+     return " ".join([trailing, *leading])
+
+ to_filter = (
+ filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+ home["Name"].lower()
+ )
+
+ if to_filter.sum() == 0:
+ raise Exception("Error")
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ ccs_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID.1": home["Asset ID.1"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ raise Exception("No match")
+
+ ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup)
+ # We get a match for all records
+ assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0]
+ assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum()
+ assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum()
+
+ # We do the same for Wates
+ wates_coordination = wates_coordination.rename(
+ columns={"Post Code": "Postcode"}
+ )
+ wates_coordination = wates_coordination[
+ wates_coordination["Retrofit Assessment"].isin(["Completed"])
+ ]
+ wates_coordination = wates_coordination[
+ ~pd.isnull(wates_coordination["Postcode"])
+ ]
+
+ wates_manual_filters = {
+ "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View",
+ "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft",
+ "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View",
+ 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13',
+ "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4",
+ '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1',
+ '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2',
+ }
+ wates_matching_lookup = []
+ # Examples to skip when we cannot get the data
+ wates_to_skip = [
+ "66 Abbatt Close", # File type is unusual, couldn't extract the data
+ "Flat 69 Goddard Road", # Doesn't exist
+ "19 Garth House", # # File type is unusual, couldn't extract the data
+ '5 Gilpin Close', # No properly formatted EPR
+ '49 The Hide, Netherfield', # TODO: TEMP HERE
+ '19 Chanders Rd',
+ '5 Chanders Rd',
+ '23 Chanders Rd',
+ '3 Chanders Rd',
+ '1 Orchard Close',
+ ]
+ wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)]
+
+ for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
+
+ # Search the folder
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False)
+ ]
+ if len(filtered) == 1:
+ wates_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID": home["Asset ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ if home["Name"] in wates_to_skip:
+ continue
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in wates_manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
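+ # We check that home["Name"] (with "Flat" removed) is contained in the survey_folder, ignoring case.
+ # NOTE(review): the r"[^\w\s]" pattern targets punctuation (not spaces), and home["Name"].replace(...) is a
+ # plain str.replace that treats it as literal text, so the name is left unchanged - confirm re.sub was intended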
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+
+ if to_filter.sum() > 1:
+ to_filter = (
+ filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() ==
+ home["Name"].replace(r"[^\w\s]", "").lstrip().lower()
+ )
+
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["survey_folder"].
+ str.replace(r"[^\w\s]", "").
+ str.replace(",", "").
+ str.replace(".", "").
+ str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ # Do a fuzzy match on the name
+ # Find the best filter
+ to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply(
+ lambda x: fuzz.partial_ratio(home["Name"], x) > 93
+ )
+ if to_filter.sum() == 0:
+ # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+ # property name is actually 7 Colville Road, so we try taking the final part of the address,
+ # splitting on space, and adding it to the front
+ def reformat_survey_folder(x):
+     """Move the last space-separated word of the final path component of `x` to the front."""
+     # e.g. ".../Colville Road 7" -> "7 Colville Road"
+     *leading, trailing = x.rsplit("/", 1)[-1].split(" ")
+     return " ".join([trailing, *leading])
+
+ to_filter = (
+ filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+ home["Name"].lower()
+ )
+
+ if to_filter.sum() == 0:
+ raise Exception("Error")
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ wates_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID": home["Asset ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ raise Exception("No match")
+ wates_matching_lookup = pd.DataFrame(wates_matching_lookup)
+
+ # We get a match for all records
+ assert wates_matching_lookup.shape[0] == wates_coordination.shape[0]
+ assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum()
+ assert not wates_matching_lookup["Asset ID"].duplicated().sum()
+
+ # Merge lookup tables onto the coordination sheets
+ wates_coordination = wates_coordination.merge(
+ wates_matching_lookup, how="left", on="Name"
+ )
+ missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])]
+ if not missed_asset_id.empty:
+ raise Exception("Missing Asset ID")
+
+ if wates_coordination["Asset ID_x"].duplicated().sum():
+ raise Exception("Duplicated IDs in wates")
+
+ # We merge the MTP (mtp_df) data on to the wates coordination
+ wates_coordination = wates_coordination.merge(
+ mtp_df, how="left", on="survey_folder"
+ )
+
+ ccs_coordination = ccs_coordination.merge(
+ ccs_matching_lookup, how="left", on="Name"
+ )
+ ccs_coordination = ccs_coordination.merge(
+ mtp_df, how="left", on="survey_folder"
+ )
+
+ retrofit_packages_board = retrofit_packages_board.merge(
+ matching_lookup, how="left", on="Name"
+ )
+
+ # We now map the retrofit assessment data to the coordinated packages
+ wates_coordination = wates_coordination.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+ ccs_coordination = ccs_coordination.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+ retrofit_packages_board = retrofit_packages_board.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+
+ # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board
+ to_remove = wates_coordination[
+ wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+ ]
+ assert to_remove.shape[0] == 4
+ # Remove them from the wates board
+ wates_coordination = wates_coordination[
+ ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+ ]
+
+ # We combine the three boards into a single board. The CCS and Wates sheets are first renamed to the
+ # retrofit packages board's column names, because pd.concat aligns the frames by column name; any
+ # column a sheet does not provide is left as NaN for that sheet's rows.
+ coordinated_packages = pd.concat(
+     [
+         retrofit_packages_board[
+             [
+                 "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating',
+                 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref',
+                 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+                 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+                 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+                 'Solar PV', 'Other measures', 'Organisation Reference',
+             ] + retrofit_assessments_data_columns_prefixed
+         ],
+         ccs_coordination[
+             [
+                 # We don't have secondary wall insulation, Flat Roof, RIR, Water Heating or
+                 # Heating Controls here; Solar PV is taken from the "PV System" column
+                 "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                 'SAP Band Install Package', 'Package Approved (Client)',
+                 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                 'Ventilation', 'Heating', 'Other Measures', 'PV System',
+                 "Asset ID.1_y",
+             ] + retrofit_assessments_data_columns_prefixed
+         ].rename(
+             columns={
+                 "SAP Band Pre": "Actual SAP Band",
+                 "SAP Rating Pre": "Actual SAP Rating",
+                 # Rating maps to Rating and Band to Band, matching the "Pre" columns above
+                 'SAP Rating Install Package': 'Modelled SAP Rating',
+                 'SAP Band Install Package': 'Modelled SAP Band',
+                 'Package Approved (Client)': 'Package Ref',
+                 'Wall Insulation': 'Main Wall Insulation',
+                 'Loft Insulation': 'Loft insulation',
+                 'Windows Upgrade': 'Window Upgrade',
+                 'Ext. Doors Upgrade': 'Door Upgrade',
+                 'Heating': 'Main Heating',
+                 'Other Measures': 'Other measures',
+                 'Asset ID.1_y': 'Organisation Reference',
+                 "PV System": "Solar PV",
+             }
+         ),
+         wates_coordination[
+             [
+                 "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+                 'SAP Band Install Package', 'Package Approved (Client)',
+                 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+                 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System"
+             ] + retrofit_assessments_data_columns_prefixed
+         ].rename(
+             columns={
+                 "SAP Band Pre": "Actual SAP Band",
+                 "SAP Rating Pre": "Actual SAP Rating",
+                 # Rating maps to Rating and Band to Band, matching the "Pre" columns above
+                 'SAP Rating Install Package': 'Modelled SAP Rating',
+                 'SAP Band Install Package': 'Modelled SAP Band',
+                 'Package Approved (Client)': 'Package Ref',
+                 'Wall Insulation': 'Main Wall Insulation',
+                 'Loft Insulation': 'Loft insulation',
+                 'Windows Upgrade': 'Window Upgrade',
+                 'Ext. Doors Upgrade': 'Door Upgrade',
+                 'Heating': 'Main Heating',
+                 'Other Measures': 'Other measures',
+                 'Asset ID_x': 'Organisation Reference',
+                 "PV System": "Solar PV",
+             }
+         )
+     ]
+ )
+
+ coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+ assert not coordinated_packages["Organisation Reference"].duplicated().sum()
+
+ # Merge the property features on
+ coordinated_packages = coordinated_packages.merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+ how="left",
+ on="Organisation Reference"
+ )
+
+ coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])]
+ coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])]
+
+ # We need the features pertaining to these priority postcodes
+
+ def find_nearest_matching_property(coordinated_packages, home):
+     """
+     Find the rows of `coordinated_packages` that most closely resemble `home`.
+
+     Column sets are tried from the strictest to the loosest; the first set on which at least one
+     row equals `home` in every listed column wins. If no set matches, rows sharing the home's
+     "Archetype ID" are used as a last resort.
+
+     :param coordinated_packages: DataFrame holding every column named in the filter levels below,
+         plus "Archetype ID".
+     :param home: Row (e.g. a Series from iterrows) holding the same columns for the property to match.
+     :return: Tuple of (matching rows, confidence). Confidence is the number attached to the filter
+         level that matched (2 is the strictest), or the largest level confidence plus one for an
+         archetype-only match. Returns (None, None) when nothing matches.
+     """
+     filter_levels = [
+         (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+         (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+         (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
+         (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
+         (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
+         (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
+     ]
+
+     for filters, match_confidence in filter_levels:
+         # Boolean indexing already returns a new frame, so no up-front copy is needed
+         match = coordinated_packages
+         for col in filters:
+             # A missing (NaN) value never compares equal, so it rules this level out
+             match = match[match[col] == home[col]]
+
+         if not match.empty:
+             return match, match_confidence
+
+     # Finally, we search for a property in the same Archetype
+     match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
+     if not match.empty:
+         # One step less confident than the loosest feature-based level
+         max_confidence = max(confidence for _, confidence in filter_levels)
+         return match, max_confidence + 1
+
+     return None, None  # No match found
+
+ coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
+ new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip()
+
+ coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip()
+ new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip()
+
+ coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
+ new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]
+
+ coordinated_packages = coordinated_packages.merge(
+ new_priority_postcodes[["Organisation Reference", "Archetype ID"]],
+ how="left",
+ on="Organisation Reference"
+ )
+
+ # For every property in the priority postcodes data, we look for a most appropriate matching property
+ no_match = []
+ matches = []
+ for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
+ # We check if the property was surveyed
+ survey_result = coordinated_packages[
+ coordinated_packages["Organisation Reference"] == home["Organisation Reference"]
+ ]
+ if not survey_result.empty:
+ to_extend = [
+ {
+ "Organisation Reference": home["Organisation Reference"],
+ "Best Match Organisation Reference": m,
+ "match_confidence": 1,
+ "Was Surveyed": True
+ } for m in survey_result["Organisation Reference"].values
+ ]
+ matches.extend(to_extend)
+ continue
+
+ closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
+ if closest_match is None:
+ no_match.append(home["Organisation Reference"])
+ continue
+
+ to_extend = [
+ {
+ "Organisation Reference": home["Organisation Reference"],
+ "Best Match Organisation Reference": m,
+ "match_confidence": match_confidence,
+ "Was Surveyed": False
+ } for m in closest_match["Organisation Reference"].values
+ ]
+ matches.extend(to_extend)
+
+ no_match_summary = new_priority_postcodes[
+ new_priority_postcodes["Organisation Reference"].isin(
+ no_match
+ )
+ ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[
+ "Organisation Reference"].count().reset_index()
+
+ no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
+
+ # len(no_match)
+ # 8764, 5607, 5646, 5071
+ # no_match_summary.shape
+ # (3953, 6), (2948, 6), (2969, 7), (2575, 7)
+
+ matches_df = pd.DataFrame(matches)
+
+ matches_df = matches_df.merge(
+ coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]],
+ left_on="Best Match Organisation Reference", right_on="Organisation Reference",
+ suffixes=("", " - Closest Match")
+ )
+
+ measures_columns = [
+ 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+ 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+ 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+ 'Solar PV', 'Other measures'
+ ]
+
+ # We want to aggregate the matches, when we have multiple
+ aggregated_matches_df = []
+ for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+
+ measures = coordinated_packages[
+ (
+ coordinated_packages["Organisation Reference"].isin(
+ mapped_matches['Best Match Organisation Reference'].values
+ )
+ )
+ ][measures_columns]
+
+ if mapped_matches.shape[0] == 1:
+ # Get the measures for this property
+ measures = measures.squeeze()
+
+ aggregated_matches_df.append(
+ {
+ "Organisation Reference": org_ref,
+ "Number of matches": 1,
+ "Proportion": 100,
+ "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
+ "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
+ "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
+ **measures
+ }
+ )
+ continue
+
+ # We need to aggregate the matches, since we have multiple
+ average_rating = mapped_matches["Survey: Current SAP Rating"].mean()
+ number_of_matches = mapped_matches.shape[0]
+ average_epc_rating = sap_to_epc(average_rating)
+ # proportion is the number of properties that have this EPC rating
+ proportion_with_this_epc = int(
+ mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
+ 0] / number_of_matches * 100
+ )
+
+ measures_aggregated = {}
+ for m in measures_columns:
+ if any(~pd.isnull(measures[m])):
+ # Check if we have 2 unique values
+ vals = measures[~pd.isnull(measures[m])][m].unique()
+ if len(vals) > 1:
+ measures_aggregated[m] = ", ".join(vals)
+ else:
+ measures_aggregated[m] = vals[0]
+
+ aggregated_matches_df.append(
+ {
+ "Organisation Reference": org_ref,
+ "Number of matches": number_of_matches,
+ "Proportion": proportion_with_this_epc,
+ "Estimated SAP Rating": average_rating,
+ "Estimated EPC Rating": average_epc_rating,
+ "Was Surveyed": False,
+ **measures_aggregated
+ }
+ )
+
+ aggregated_matches_df = pd.DataFrame(aggregated_matches_df)
+
+ mapped_priority_list = new_priority_postcodes.merge(
+ aggregated_matches_df, on="Organisation Reference", how="left"
+ )
+
+ mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0]
+
+ # If we have a leading number like 01, 02, ..., 09, we remove the leading 0 (e.g. "01 High St" -> "1 High St")
+ def remove_leading_zero(address):
+     """Strip the zero from a leading 01-09 house number; non-str values (e.g. NaN) are returned as-is."""
+     return re.sub(r"^0([1-9]) ", r"\1 ", address) if isinstance(address, str) else address
+
+ mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
+ mapped_priority_list["address1"] = np.where(
+ mapped_priority_list["Organisation Reference"] == 37004,
+ "8 Mason Road",
+ mapped_priority_list["address1"]
+ )
+ mapped_priority_list["address1"] = np.where(
+ mapped_priority_list["Organisation Reference"] == 37003,
+ "9 Mason Road",
+ mapped_priority_list["address1"]
+ )
+
+ mapped_priority_list = mapped_priority_list.rename(
+ columns={"UPRN": "uprn"}
+ )
+ mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
+
+ # Flag where 2 out of the three columns have consensus
+ mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = (
+ (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) |
+ (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) |
+ (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"])
+ )
+
+ # Let's get the newest EPC data for these properties
+ # We merge on UPRN, when we have it
+ # from etl.route_march_data_pull.app import get_data
+ # epc_data, errors, nodata = get_data(
+ # asset_list=mapped_priority_list,
+ # fulladdress_column="Address",
+ # address1_column="address1",
+ # postcode_column="Postcode",
+ # manual_uprn_map={},
+ # epc_api_only=True
+ # )
+ #
+ # epc_df = pd.DataFrame(epc_data)
+ # epc_df.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False
+ # )
+ epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"))
+ epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"})
+
+ # We now package up the data
+
+ # Sheet 1 is the base coordination data
+ output_coordination_sheet = coordinated_packages[
+ [
+ "Name", "Postcode", 'Organisation Reference', 'Package Ref',
+ 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+ 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+ 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+ 'Solar PV', 'Other measures',
+ 'Survey: Current SAP Rating',
+ 'Survey: Current EPC Band',
+ 'Survey: Primary Energy Use (kWh/yr)',
+ 'Survey: Primary Energy Use Intensity (kWh/m2/yr)',
+ 'Survey: Number of Storeys', 'Survey: Fuel Bill',
+ 'Survey: Window Age Description',
+ 'Survey: Window Age Description Proportion (%)',
+ 'Survey: Secondary Window Age Description',
+ 'Survey: Secondary Window Age Description Proportion (%)',
+ 'Survey: Number of Windows', 'Survey: Total Number of Doors',
+ 'Survey: Number of Insulated Doors',
+ 'Survey: Existing Primary Heating System',
+ 'Survey: Existing Primary Heating PCDF Reference',
+ 'Survey: Existing Primary Heating Controls',
+ 'Survey: Existing Primary Heating % of Heat',
+ 'Survey: Existing Secondary Heating System',
+ 'Survey: Existing Secondary Heating PCDF Reference',
+ 'Survey: Existing Secondary Heating Controls',
+ 'Survey: Existing Secondary Heating % of Heat',
+ 'Survey: Secondary Heating Code', 'Survey: Water Heating Code',
+ 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)',
+ 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)',
+ 'Survey: First Extension Wall Area (m2)',
+ 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings',
+ 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type',
+ 'Survey: Main Roof Insulation',
+ 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type',
+ 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining',
+ 'Survey: Main Wall Thickness',
+ 'Survey: Main Building Alternative Wall Type',
+ 'Survey: Main Building Alternative Wall Insulation',
+ 'Survey: Main Building Alternative Wall Dry-lining',
+ 'Survey: Main Building Alternative Wall Thickness',
+ 'Survey: Main Fuel',
+ 'Survey: Main Building Age Band',
+ 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
+ ]
+ ].rename(
+ columns={
+ 'Walls': "Parity - Walls",
+ 'Roofs': "Parity - Roof",
+ 'Heating': "Parity - Heating",
+ 'Main Fuel': "Parity - Fuel",
+ 'Age': "Parity - Age Band",
+ 'Property Type': "Parity - Property Type"
+ }
+ )
+
+ # Sheet 2 is the lookup table which maps the properties to their closest match
+ # We need to bring in the parity attributes between the mapped properties so we can see side-by-side
+ mapped_lookup = matches_df[
+ [
+ 'Organisation Reference',
+ 'Best Match Organisation Reference',
+ 'Survey: Current EPC Band',
+ 'Survey: Current SAP Rating',
+ "Was Surveyed",
+ "match_confidence",
+ ]
+ ].rename(
+ columns={
+ 'Best Match Organisation Reference': "Best Match - Organisation Reference",
+ "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band",
+ 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating"
+ }
+ ).merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+ "Total Floor Area"]],
+ how="left",
+ on="Organisation Reference"
+ ).merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+ "Total Floor Area"]].rename(
+ columns={
+ "Organisation Reference": "Best Match - Organisation Reference",
+ "Walls": "Best Match - Walls",
+ "Roofs": "Best Match - Roof",
+ "Heating": "Best Match - Heating",
+ "Main Fuel": "Best Match - Main Fuel",
+ "Age": "Best Match - Age",
+ "Property Type": "Best Match - Property Type",
+ "Total Floor Area": "Best Match - Total Floor Area"
+ }
+ ),
+ how="left",
+ on="Best Match - Organisation Reference"
+ ).merge(
+ coordinated_packages[
+ [
+ "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
+ 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
+ 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
+ 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)',
+ 'Survey: Main Building Age Band',
+ ]
+ ].rename(
+ columns={
+ "Organisation Reference": "Best Match - Organisation Reference",
+ 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type',
+ 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation',
+ 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type',
+ 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation',
+ 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness',
+ 'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System',
+ }
+ ),
+ how="left",
+ on="Best Match - Organisation Reference"
+ )
+
+ # Finally, we have the property, against the mapped home with the estimated SAP scores and the EPC data
+ worksheet = mapped_priority_list[
+ [
+ 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
+ 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
+ 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion',
+ 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed",
+ 'Main Wall Insulation',
+ 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof',
+ 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation',
+ 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV',
+ 'Other measures', "2 of 3 Data Sources Have Consensus on EPC"
+ ]
+ ].rename(
+ columns={
+ "SAP": "Parity - SAP Rating",
+ "SAP Band": "Parity - EPC Rating",
+ "Property Type": "Parity - Property Type",
+ "Walls": "Parity - Walls",
+ "Roofs": "Parity - Roofs",
+ 'Glazing': "Parity - Glazing",
+ 'Heating': 'Parity - Heating',
+ 'Main Fuel': 'Parity - Main Fuel',
+ 'Hot Water': 'Parity - Hot Water',
+ 'Proportion': 'Proportion of matched properties with same EPC rating',
+ }
+ ).merge(
+ epc_df[
+ [
+ "Organisation Reference",
+ "uprn",
+ "current-energy-efficiency",
+ "current-energy-rating",
+ "lodgement-date",
+ "construction-age-band",
+ "walls-description",
+ "roof-description",
+ "mainheat-description",
+ "windows-description",
+ "hotwater-description",
+ "main-fuel",
+ "total-floor-area",
+ ]
+ ].rename(
+ columns={
+ "uprn": "Last EPC - uprn",
+ "current-energy-efficiency": "Last EPC - SAP Score",
+ "current-energy-rating": "Last EPC - EPC Rating",
+ "lodgement-date": "Last EPC - Date Lodged",
+ "construction-age-band": "Last EPC - Age Band",
+ "walls-description": "Last EPC - Walls",
+ "roof-description": "Last EPC - Roof",
+ "mainheat-description": "Last EPC - Heating",
+ "windows-description": "Last EPC - Windows",
+ "hotwater-description": "Last EPC - Hot Water",
+ "main-fuel": "Last EPC - Main Fuel",
+ "total-floor-area": "Last EPC - Total Floor Area"
+ }
+ ),
+ how="left",
+ on='Organisation Reference'
+ )
+
+ worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime(
+ worksheet["Last EPC - Date Lodged"]).dt.year
+
+ worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str)
+
+ worksheet["uprn"] = np.where(
+ pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]),
+ worksheet["Last EPC - uprn"],
+ worksheet["uprn"]
+ )
+
+ worksheet["uprn"] = worksheet["uprn"].replace("", "")
+
+ worksheet = worksheet.drop(columns=["Last EPC - uprn"])
+
+ # Save to Excel with multiple sheets
+ excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx")
+ with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
+ worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True)
+ mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True)
+ output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True)
+
+# if __name__ == "__main__":
+# main()
diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py
new file mode 100644
index 00000000..eedae9b9
--- /dev/null
+++ b/etl/customers/stonewater/data_cleaning.py
@@ -0,0 +1,155 @@
+import os
+import shutil
+from tqdm import tqdm
+from etl.access_reporting.app import SharePointClient
+
+
+def delete_large_files(folder_path=None):
+    """
+    Delete photos, designs and other bulky files which we don't need from the survey folders on disk.
+
+    The tree is expected to be ``folder_path/<sub-folder>/<property folder>/...``. Inside every property folder
+    the sub-folders named in ``folders_to_delete`` are removed wholesale; in the RA Coordinator Info folder only
+    ``.MOV`` / ``.jpg`` files and the "Property Pics" folder are removed. Deletion is permanent.
+
+    :param folder_path: Root folder to clean; defaults to the local "Wave 2.1 Surveys" folder.
+    :return: None
+    """
+    if folder_path is None:
+        folder_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys"
+
+    # Folders removed wholesale (several spelling / numbering variants of the same names are listed)
+    folders_to_delete = (
+        "1. RA Property Pics", "4. Air Tightness Tests", "5. RD Design Info",
+        "1. RA Property PIcs", "Post EPC Photos", "4. RD Design Info",
+        "5. Installer Info", "6. Trustmark lodgement", "7.Post Install Inspection Photos",
+        "6. Trustmark Lodgement", "7. Post Inspection Photos",
+    )
+
+    for subfolder in os.listdir(folder_path):
+        subfolder_path = os.path.join(folder_path, subfolder)
+        if not os.path.isdir(subfolder_path):
+            continue
+
+        # Each sub-folder holds the property folders (property_name avoids shadowing the builtin `property`)
+        for property_name in tqdm(os.listdir(subfolder_path)):
+            property_path = os.path.join(subfolder_path, property_name)
+            if not os.path.isdir(property_path):
+                continue
+
+            property_contents = os.listdir(property_path)
+            for folder_to_delete in folders_to_delete:
+                folder_to_delete_path = os.path.join(property_path, folder_to_delete)
+                if folder_to_delete in property_contents and os.path.isdir(folder_to_delete_path):
+                    # Delete the folder, even if it's not empty
+                    shutil.rmtree(folder_to_delete_path)
+
+            # Prefer the "2." coordinator folder; otherwise fall back to the "1." naming
+            if "2. RA Coordinator Info" in property_contents:
+                coordinator_folder = "2. RA Coordinator Info"
+            else:
+                coordinator_folder = "1. RA Coordinator Info"
+            coordinator_info_path = os.path.join(property_path, coordinator_folder)
+            if not os.path.isdir(coordinator_info_path):
+                # Neither variant exists: nothing to prune here (listing it would raise FileNotFoundError)
+                continue
+
+            coordinator_info_contents = os.listdir(coordinator_info_path)
+            for file in coordinator_info_contents:
+                # Only .MOV and .jpg entries are removed; everything else in this folder is kept
+                if file.endswith((".MOV", ".jpg")):
+                    os.remove(os.path.join(coordinator_info_path, file))
+
+            if "Property Pics" in coordinator_info_contents:
+                shutil.rmtree(os.path.join(coordinator_info_path, "Property Pics"))
+
+
+def download_data_from_sharepoint():
+    """
+    Download the retrofit assessment and mid-term folders of the Stonewater properties from SharePoint.
+
+    Walks ``<remote root>/<top-level folder>/<property folder>`` and, for every property sub-folder whose name
+    contains one of ``wanted_fragments`` (case-insensitive), downloads it to the mirrored path under the local
+    root, asking the client to exclude MOV and jpg files.
+
+    The SharePoint credentials and site id are read from the SHAREPOINT_CLIENT_ID, SHAREPOINT_CLIENT_SECRET,
+    SHAREPOINT_TENANT_ID and OSMOSIS_SHAREPOINT_SITE_ID environment variables.
+
+    :return: None
+    """
+    remote_root = "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders"
+    local_root = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2"
+
+    # Only these top-level folders are pulled
+    wanted_folders = [
+        "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth",
+        "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire",
+        "9. Guildford", "10. Little Island", "11. CCS Dorset",
+    ]
+    # Name fragments (matched against the lower-cased name) of the property sub-folders worth downloading
+    wanted_fragments = ("ra coordinator info", "retrofit assessment", "ra info", "mtp", "mid-term")
+
+    client = SharePointClient(
+        tenant_id=os.environ.get("SHAREPOINT_TENANT_ID"),
+        client_id=os.environ.get("SHAREPOINT_CLIENT_ID"),
+        client_secret=os.environ.get("SHAREPOINT_CLIENT_SECRET"),
+        site_id=os.environ.get("OSMOSIS_SHAREPOINT_SITE_ID"),
+    )
+
+    # List the root location and keep the top-level folders we are interested in
+    root_listing = client.list_folder_contents(
+        drive_id=client.document_drive["id"],
+        folder_path=remote_root,
+    )
+    top_level_folders = [entry for entry in root_listing["value"] if entry["name"] in wanted_folders]
+
+    for top_level in top_level_folders:
+        top_level_name = top_level["name"]
+
+        # Every entry listed in a top-level folder is treated as a property folder
+        top_level_listing = client.list_folder_contents(
+            drive_id=client.document_drive["id"],
+            folder_path="/".join([remote_root, top_level_name]),
+            page_size=100,
+        )
+
+        for property_entry in list(top_level_listing["value"]):
+            property_name = property_entry["name"]
+
+            # Go into the property folder and get its contents
+            property_listing = client.list_folder_contents(
+                drive_id=client.document_drive["id"],
+                folder_path="/".join([remote_root, top_level_name, property_name]),
+            )
+            entries = property_listing.get("value")
+            if not entries:
+                # Nothing was listed for this property folder
+                continue
+
+            # Look for the retrofit assessment / MTP folders among the property's sub-folders
+            wanted_sub_folders = [
+                entry for entry in entries
+                if any(fragment in entry["name"].lower() for fragment in wanted_fragments)
+            ]
+
+            for sub_folder in wanted_sub_folders:
+                remote_path = os.path.join(
+                    remote_root,
+                    top_level_name,
+                    property_name,
+                    sub_folder["name"],
+                )
+                # Mirror the remote folder structure on the local machine
+                local_path = os.path.join(
+                    local_root,
+                    top_level_name,
+                    property_name,
+                    sub_folder["name"],
+                )
+
+                client.download_sharepoint_folder(
+                    drive_id=client.document_drive["id"],
+                    folder_path=remote_path,
+                    download_dir=local_path,
+                    excluded_file_types=["MOV", "jpg"],
+                )
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
new file mode 100644
index 00000000..6666ce15
--- /dev/null
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -0,0 +1,542 @@
+import os
+import time
+import json
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from utils.s3 import read_from_s3, read_pickle_from_s3
+import msoffcrypto
+from io import BytesIO
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(asset_list):
+    """
+    Look up the newest EPC (and its recommendations) for every property in ``asset_list``.
+
+    :param asset_list: DataFrame with "Postcode", "Number", "Full Address" and "row_id" columns.
+    :return: Tuple ``(epc_data, errors)``: one dict per property for which an EPC was found (its "row_id", the
+        EPC fields and a "recommendations" list) and the "row_id" of every property whose lookup raised.
+    """
+    epc_data = []
+    errors = []
+    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
+        try:
+            searcher = SearchEpc(
+                address1=str(home["Number"]),
+                postcode=home["Postcode"],
+                auth_token=EPC_AUTH_TOKEN,
+                os_api_key="",
+                property_type=None,
+                fast=True,
+                full_address=home["Full Address"],
+                max_retries=5
+            )
+            # Force the skipping of estimating the EPC
+            searcher.ordnance_survey_client.property_type = None
+            searcher.ordnance_survey_client.built_form = None
+            searcher.find_property(skip_os=True)
+            if searcher.newest_epc is None:
+                continue
+
+            # Recommendations are best-effort; catch Exception (not everything) so Ctrl-C still stops the run
+            try:
+                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+            except Exception:
+                property_recommendations = {"rows": []}
+            epc_data.append({
+                "row_id": home["row_id"],
+                **searcher.newest_epc.copy(),
+                "recommendations": property_recommendations["rows"],
+            })
+        except Exception:
+            # Record the failed property and pause before the next lookup
+            errors.append(home["row_id"])
+            time.sleep(5)
+
+    return epc_data, errors
+
+
+def app():
+ """
+ This code creates a list of cavity properties, for review
+ """
+
+ # Read in the password protected master
+ # TODO: This file should be deleted!
+
+ # Path to the password-protected Excel file
+ file_path = ("/Users/khalimconn-kowlessar/Downloads/STONEWATER MASTER SHEET - UPDATED 20.5.24 - K- PASSWORD "
+ "PROTECTED.xlsx")
+ password = "STONE123" # Replace with the actual password
+
+ # Open the file and decrypt it
+ with open(file_path, "rb") as f:
+ decrypted_file = BytesIO()
+ office_file = msoffcrypto.OfficeFile(f)
+ office_file.load_key(password=password)
+ office_file.decrypt(decrypted_file)
+
+ # Read the decrypted file into a DataFrame
+ eco_rolling_master = pd.read_excel(decrypted_file, sheet_name="Sheet1", engine="openpyxl")
+
+ eco_rolling_master = eco_rolling_master[
+ ~eco_rolling_master['INSTALL/CANCELLATION DATE'].str.contains("CANCELLED")
+ ]
+
+ archetyped_properties = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 - "
+ "Archetyped V3.1.xlsx",
+ header=4
+ )
+
+ cavity_descriptions = [
+ "Cavity: AsBuilt (1983-1995)",
+ "Cavity: AsBuilt (Post 1995)",
+ "Cavity: AsBuilt (Pre 1976)",
+ "Cavity: AsBuilt (1976-1982)",
+ ]
+
+ archetyped_properties["Is Cavity Property"] = archetyped_properties["Wall Type"].isin(cavity_descriptions)
+ # We also identify any properties where properties were found to need cavity wall insulation
+
+ costed_packages = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater - Costed Retrofit Packages "
+ "20241030 (WIP) Single Model V2.xlsx",
+ sheet_name="Modelled Packages",
+ header=13
+ )
+
+ needs_cwi = costed_packages[
+ costed_packages["Main Wall Insulation"].isin(
+ [
+ "Poss Extract CWI & Refill (issues identified)",
+ "CWI RdSAP Default"
+ ]
+ )
+ ][["Address ID", "Address", "Current SAP Rating", "Current EPC Band", "Postcode", "Archetype ID",
+ "Main Wall Insulation",
+ "Main Roof Type", "Main Roof Insulation", "Main Roof Insulation Thickness"]]
+
+ # We flag these properties
+ archetyped_properties["Survey shows CWI needed for Archetype"] = archetyped_properties["Archetype ID"].isin(
+ needs_cwi["Archetype ID"]
+ )
+
+ archetyped_properties = archetyped_properties[~pd.isnull(archetyped_properties["Address ID"])]
+ archetyped_properties = archetyped_properties[archetyped_properties["Address ID"] != "Address ID"]
+
+ # this is the big list!!!
+ features = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+ "master sheet.csv",
+ encoding='latin1'
+ )
+ features["Address ID"] = features["Address ID"].astype(str)
+
+ features_to_merge = features[
+ [
+ "Address ID", "Organisation Reference", "Age", "Property Type", "Walls", "Roofs", "Glazing", "Heating",
+ "Main Fuel",
+ "Hot Water",
+ "Renewables", "Total Floor Area"
+ ]
+ ]
+
+ stonewater_cavity_properties = archetyped_properties[
+ ["Name", "Postcode", "Osm. ID", "Org. ref.", "Address ID", "UPRN", "UDPRN", "Archetype ID", "House no",
+ "Street name",
+ "Address line 2", "City/Town", "Is Cavity Property", "Survey shows CWI needed for Archetype"]
+ ].merge(
+ features_to_merge, how="left", on="Address ID"
+ )
+
+ # We filter this down to the properties that are cavity properties
+ stonewater_cavity_properties = stonewater_cavity_properties[
+ stonewater_cavity_properties["Is Cavity Property"] |
+ stonewater_cavity_properties["Survey shows CWI needed for Archetype"]
+ ]
+
+ stonewater_cavity_properties["Reason Included"] = "As Built Cavity Property"
+ stonewater_cavity_properties["Reason Included"] = np.where(
+ stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
+ ~stonewater_cavity_properties["Is Cavity Property"],
+ "Survey revealed potential need for CWI or extract and re-fill",
+ stonewater_cavity_properties["Reason Included"]
+ )
+ stonewater_cavity_properties["Reason Included"] = np.where(
+ stonewater_cavity_properties["Survey shows CWI needed for Archetype"] &
+ stonewater_cavity_properties["Is Cavity Property"],
+ "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
+ stonewater_cavity_properties["Reason Included"]
+ )
+ # We indicate the exact properties that need CWI, based on survey findings
+ stonewater_cavity_properties["Reason Included"] = np.where(
+ stonewater_cavity_properties["Address ID"].isin(
+ needs_cwi[needs_cwi["Main Wall Insulation"] == "CWI RdSAP Default"]["Address ID"].astype(int).astype(
+ str).values
+ ),
+ "Survey showed this property needs CWI",
+ stonewater_cavity_properties["Reason Included"]
+ )
+
+ stonewater_cavity_properties["Reason Included"] = np.where(
+ stonewater_cavity_properties["Address ID"].isin(
+ needs_cwi[needs_cwi["Main Wall Insulation"] == "Poss Extract CWI & Refill (issues identified)"][
+ "Address ID"].astype(int).astype(str).values
+ ),
+ "Survey showed this property could need extract and re-fill",
+ stonewater_cavity_properties["Reason Included"]
+ )
+
+ # We flag units that were installed under ECO3
+ numeric_ids = eco_rolling_master[eco_rolling_master["STONEWATER UPRN"] != "NOT ON ASSET LIST"]
+ numeric_ids = numeric_ids[~pd.isnull(numeric_ids["STONEWATER UPRN"])]
+ numeric_ids["STONEWATER UPRN"] = numeric_ids["STONEWATER UPRN"].astype(int)
+
+ stonewater_cavity_properties["Installed under ECO3"] = stonewater_cavity_properties["Org. ref."].isin(
+ numeric_ids['STONEWATER UPRN'].values
+ )
+
+ # Which postcodes were installed under ECO3
+ priority_list_eco3 = stonewater_cavity_properties[
+ stonewater_cavity_properties["Installed under ECO3"]
+ ]["Postcode"].unique()
+
+ # These are properties that were not installed under ECO3, that have the same postcodes as properties
+ # installed under ECO3
+
+ # These are 66 properties we might want to start with as an immediate priority
+ stonewater_cavity_properties["Same Postcode as Installed under ECO3"] = (
+ ~stonewater_cavity_properties["Installed under ECO3"] & (
+ stonewater_cavity_properties["Postcode"].isin(priority_list_eco3)
+ )
+ )
+
+ stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
+ # Find the postcodes where an Osmosis survey revealed a need for CWI
+ postcodes_found_needing_cwi = stonewater_cavity_properties[
+ stonewater_cavity_properties["Reason Included"].isin(
+ [
+ "Survey revealed potential need for CWI or extract and re-fill",
+ "Surveyed revealed potential need for CWI or extract and re-fill and is an as built cavity property",
+ "Survey showed this property needs CWI",
+ "Survey showed this property could need extract and re-fill"
+ ]
+ )
+ ]["Postcode"].unique()
+
+ stonewater_cavity_properties["Suspected Needs CWI - not surveyed"] = (
+ (
+ stonewater_cavity_properties[
+ "Postcode"].isin(
+ postcodes_found_needing_cwi)
+ ) & (
+ ~stonewater_cavity_properties[
+ "Reason Included"].isin(
+ [
+ "Survey revealed potential need "
+ "for CWI or extract and re-fill",
+ "Surveyed revealed potential "
+ "need for CWI or extract and "
+ "re-fill and is an as built "
+ "cavity property",
+ "Survey showed this property "
+ "needs CWI",
+ "Survey showed this property "
+ "could need extract and re-fill"
+ ]
+ )
+ )
+ )
+
+ # Merge the EPCs on, with the data we need
+ stonewater_cavity_properties = stonewater_cavity_properties.rename(
+ columns={
+ "Age": "Parity - Build Age",
+ "Property Type": "Parity - Property Type",
+ "Walls": "Parity - Wall Construction",
+ "Roofs": "Parity - Roof Construction",
+ "Glazing": "Parity - Glazing Type",
+ "Heating": "Parity - Heating Type",
+ "Main Fuel": "Parity - Main Fuel",
+ "Hot Water": "Parity - Hot Water",
+ "Renewables": "Parity - Renewables",
+ "Total Floor Area": "Parity - Total Floor Area"
+ }
+ )
+
+ # We now flag the additional properties in the as built list
+
+ additional_properties = features[
+ ~features["Address ID"].isin(archetyped_properties["Address ID"].values)
+ ]
+
+ # Filter on as built cavity properties
+ additional_properties = additional_properties[
+ additional_properties["Walls"].isin(cavity_descriptions)
+ ]
+ additional_properties["Full Address"] = additional_properties["Address"].copy()
+ house_numbers = []
+ for _, x in tqdm(additional_properties.iterrows(), total=len(additional_properties)):
+ house_no = SearchEpc.get_house_number(x["Address"].split(",")[0], x["Postcode"])
+ if house_no is None:
+ house_no = x["Address"].split(",")[0]
+ # If we end up with a number like "01" we need to remove the leading zero
+ house_no = house_no.lstrip("0")
+ house_numbers.append(
+ {
+ "Address ID": x["Address ID"],
+ "Number": house_no
+ }
+ )
+
+ house_numbers = pd.DataFrame(house_numbers)
+ additional_properties = additional_properties.merge(house_numbers, how="left", on="Address ID")
+ additional_properties["row_id"] = additional_properties["Address ID"].copy()
+
+ # Flag any units in this list that were installed under ECO3
+ additional_properties["Installed under ECO3"] = additional_properties["Organisation Reference"].isin(
+ numeric_ids['STONEWATER UPRN'].values
+ )
+
+ # Additional list ECO3
+ additional_list_eco3 = additional_properties[additional_properties["Installed under ECO3"]]["Postcode"].unique()
+
+ # These are properties that were not installed under ECO3, that have the same postcodes as properties
+ # installed under ECO3
+ # These are 297 properties we might want to start with as an immediate priority
+ additional_properties["Same Postcode as Installed under ECO3"] = (
+ ~additional_properties["Installed under ECO3"] & (
+ additional_properties["Postcode"].isin(additional_list_eco3)
+ )
+ )
+
+ # We do some additional manual checks, for ECO3 properties that were installed that didn't get matched to either
+ # dataset
+ numeric_ids["In asset list"] = numeric_ids["STONEWATER UPRN"].isin(
+ stonewater_cavity_properties['Org. ref.'].astype(int).values
+ )
+ numeric_ids["In asset list"] = numeric_ids["In asset list"] | (
+ numeric_ids["STONEWATER UPRN"].isin(
+ additional_properties['Organisation Reference'].astype(int).values
+ )
+ )
+
+ # eco3_installs_not_in_asset_list = numeric_ids[~numeric_ids["In asset list"]]
+ # # We now take samples of properties randomly and manually check the ID against the asset list
+ # print(eco3_installs_not_in_asset_list.sample(1)[["STONEWATER UPRN", "Post Code", "NO ", "Street / Block Name", ]])
+ # # Checked STONEWATER UPRN
+ # # 9862, BH15 1NR, 33, THE QUAY FOYER [x]
+ # # 12785, S01 66PN, 57, SEACOLE GARDENS [x]
+ # # 26071, MK42 0TE, 51, De Havilland Avenue, Shortstown [x]
+ # # 18213, HR6 9UW, 20 Ford Street [x]
+ # # 24344, LU4 9FF, 6 SEAL CLOSE [x]
+ # # 31222, SN14 0QZ, 7 HARDBROOK COURT [x]
+ # # 9343, SP4 7XL, 10 OAK PLACE [x]
+ # # 34730, LU5 5TN, 4 TUDOR DRIVE [x]
+ # # 7021, BN27 2BZ, 32 BUTTS FIELD []
+ #
+ # stonewater_cavity_properties[stonewater_cavity_properties['Org. ref.'] == 7021]
+ # stonewater_cavity_properties[stonewater_cavity_properties['Postcode'] == "BN27 2BZ"]["Name"]
+ #
+ # additional_properties[additional_properties['Organisation Reference'] == 7021]
+ # additional_properties[additional_properties['Postcode'] == "BN27 2BZ"][["Address"]]
+
+ # Pull the EPCs for these properties
+ # additional_properties_epcs, errors = get_data(additional_properties)
+
+ # Save this data as a pickle
+ # import pickle
+ # with open("/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/additional_properties_epcs.pkl",
+ # "wb") as f:
+ # pickle.dump(additional_properties_epcs, f)
+
+ additional_properties["Suspected Needs CWI - not surveyed"] = (
+ (
+ additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
+ ~additional_properties["Installed under ECO3"]
+ )
+ )
+
+ # We drop Full Address
+ additional_properties = additional_properties.drop(columns=["Full Address"])
+ additional_properties2 = additional_properties[[
+ "Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
+ "Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
+ 'Same Postcode as Installed under ECO3', "Organisation Reference",
+ ]].rename(
+ columns={
+ "Organisation Reference": "Org. ref.",
+ "SAP": "Parity - Predicted SAP",
+ "SAP Band": "Parity - Predicted SAP Band",
+ "Age": "Parity - Build Age",
+ "Property Type": "Parity - Property Type",
+ "Walls": "Parity - Wall Construction",
+ "Roofs": "Parity - Roof Construction",
+ "Glazing": "Parity - Glazing Type",
+ "Heating": "Parity - Heating Type",
+ "Main Fuel": "Parity - Main Fuel",
+ "Hot Water": "Parity - Hot Water",
+ "Renewables": "Parity - Renewables",
+ "Total Floor Area": "Parity - Total Floor Area"
+ }
+ )
+
+ # Combine the data:
+
+ stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
+ features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
+ )
+ full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
+ full_dataset = full_dataset.drop(columns=['Osm. ID'])
+
+ # We now define the priority list for non-intrusives
+ full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
+ full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0]
+
+ # Strip out anything we definitely don't want
+ full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]
+
+ areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique()
+
+ priorities = full_dataset[
+ full_dataset["Postal Region 2"].isin(areas)
+ ]
+
+ region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index()
+ region_prevalance = region_prevalance[region_prevalance["count"] > 100]
+ df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)]
+
+ df["Postal Region"].value_counts()
+ df["Postal Region 2"].value_counts()
+
+ if df["Installed under ECO3"].sum():
+ raise ValueError("There are properties in the priority list that were installed under ECO3")
+
+ df.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
+ "revised list.csv",
+ index=False
+ )
+
+ # We save the data locally
+ # stonewater_cavity_properties.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
+ # "postcodes.csv",
+ # index=False
+ # )
+ # additional_properties2.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
+ # "non-priority postcodes.csv",
+ # index=False
+ # )
+ # # Save the survey findings
+ # needs_cwi.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI -
+ # WIP.csv",
+ # index=False
+ # )
+
+
+def cross_reference_epc_programme():
+    """
+    Cross-reference the ECO3 not-completed survey list against the Parity download master sheet.
+
+    :return: one dict per ECO3 address holding its "ADDRESS", its "house_number" and "matches", the
+        master sheet rows whose "Address" scores above 90 on fuzz.ratio against it
+    """
+    from fuzzywuzzy import fuzz
+
+    eco3_fallout = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE "
+        "SURVEYED - ECO3 NOT COMPLETED.xlsx"
+    )
+
+    def _eco3_house_number(row):
+        # Fall back to the first comma-separated part of the address when no number can be parsed
+        house_no = SearchEpc.get_house_number(row["ADDRESS"], "")
+        return row["ADDRESS"].split(",")[0] if house_no is None else house_no
+
+    # Assign through apply: writing to the row copies yielded by iterrows() never reaches the frame
+    eco3_fallout["house_number"] = eco3_fallout.apply(_eco3_house_number, axis=1)
+
+    stonewater_modelled_above_c = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+    stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply(
+        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1
+    )
+
+    eco3_fallout_matched_to_above_c = []
+    for _, eco3_row in eco3_fallout.iterrows():
+        # Fuzzy match on the full address; fuzz.ratio is a similarity score between 0 and 100
+        is_close = stonewater_modelled_above_c["Address"].apply(
+            lambda address: fuzz.ratio(address, eco3_row["ADDRESS"]) > 90
+        )
+        eco3_fallout_matched_to_above_c.append({
+            "ADDRESS": eco3_row["ADDRESS"], "house_number": eco3_row["house_number"],
+            "matches": stonewater_modelled_above_c[is_close],
+        })
+    return eco3_fallout_matched_to_above_c
+
+
+def finalise_list_for_non_intrusives():
+    """
+    Produce the final Stonewater non-intrusives list and write it to a local Excel workbook.
+
+    Rows installed under ECO3, or whose Address ID appears in the "Modelled Packages" sheet of the bid
+    packages workbook, are dropped; the organisation reference is joined on from the Parity master
+    sheet and only postcodes containing a row flagged "Include in non-intrusives" are kept.
+    """
+    candidates = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
+        "Non-Intrusives.xlsx"
+    )
+    # Remove anything installed under ECO3
+    not_eco3 = ~candidates["Installed under ECO3"]
+    candidates = candidates[not_eco3]
+
+    bid_packages = pd.read_excel(
+        "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
+        "(1).xlsx",
+        header=13,
+        sheet_name="Modelled Packages"
+    )
+    # Flag, then drop, the addresses already in the bid packages (removed 54 addresses when this was run)
+    candidates["Surveyed by Osmosis"] = candidates["Address ID"].isin(
+        bid_packages["Address ID"].values
+    )
+    shortlist = candidates[~candidates["Surveyed by Osmosis"]]
+
+    master_sheet = pd.read_csv(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+        "master sheet.csv",
+        encoding='latin1'
+    )
+    # Add on the organisation reference
+    org_refs = master_sheet[["Organisation Reference", "Address ID"]]
+    shortlist = shortlist.merge(
+        org_refs, how="left", on="Address ID"
+    )
+
+    outward_codes = shortlist["Postcode"].str.split(" ").str[0]
+    shortlist["Postal Region"] = outward_codes.str[0:2]
+
+    # NOTE(review): selection below is by full postcode, "Postal Region" is not used - confirm intended
+    flagged = shortlist[shortlist["Include in non-intrusives"]]
+    selected_postcodes = flagged["Postcode"].unique()
+    shortlist["Is in region"] = shortlist["Postcode"].isin(selected_postcodes)
+
+    # Filter down to the selected postcodes
+    shortlist = shortlist[shortlist["Is in region"]]
+
+    shortlist.to_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
+        "List - final.xlsx")
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
new file mode 100644
index 00000000..09ba20bd
--- /dev/null
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -0,0 +1,11 @@
+PyPDF2
+pandas
+tqdm
+openpyxl
+boto3
+epc-api-python==1.0.2
+usaddress==0.5.11
+fuzzywuzzy==0.18.0
+python-dotenv
+scipy
+
diff --git a/etl/customers/united living/get_data.py b/etl/customers/united living/get_data.py
new file mode 100644
index 00000000..bc4ab400
--- /dev/null
+++ b/etl/customers/united living/get_data.py
@@ -0,0 +1,73 @@
+import os
+import pandas as pd
+import numpy as np
+from asset_list.utils import get_data
+from backend.SearchEpc import SearchEpc
+from etl.spatial.OpenUprnClient import OpenUprnClient
+
+from dotenv import load_dotenv
+
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def app():
+    """
+    Pull UPRNs and conservation / listed / heritage flags for the United Living GMCA property list.
+
+    Addresses are looked up through get_data; hand-recorded UPRNs are patched in before the spatial join.
+    """
+    filepath = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03.xlsx"
+
+    df = pd.read_excel(filepath)
+    df["row_id"] = df.index
+    df["house_number"] = df.apply(
+        lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]),
+        axis=1
+    )
+
+    # Only the property data is used from here on; the other two return values are ignored
+    properties_data, _, _ = get_data(
+        df=df,
+        manual_uprn_map={},
+        epc_auth_token=EPC_AUTH_TOKEN,
+        uprn_column=None,
+        fulladdress_column="Address",
+        address1_column="house_number",
+        postcode_column="Postcode",
+        property_type_column=None,
+        built_form_column=None,
+        epc_api_only=True,
+        row_id_name="row_id",
+    )
+
+    data = df.merge(
+        pd.DataFrame(properties_data)[["uprn", "row_id"]],
+        how="left", left_on="row_id", right_on="row_id"
+    )
+
+    # Fill missing UPRNs (recorded by hand; the index and postcode noted at the time are alongside)
+    manual_uprns = {
+        "108 Alexandra Street": 100011536830,  # 53, OL6 9QP
+        "301 Whiteacre Road": 100011557437,  # 56, OL6 9QF
+        "97 Princess Street": 100011551813,  # 65, OL6 9QJ
+    }
+    for address, uprn in manual_uprns.items():
+        data["uprn"] = np.where(data["Address"] == address, uprn, data["uprn"])
+
+    # We now get whether the property is listed, heritage or in a conservation area
+    spatial_data = OpenUprnClient.get_spatial_data(uprns=data["uprn"].tolist(), bucket_name="retrofit-data-dev")
+    spatial_data = spatial_data.rename(columns={"UPRN": "uprn"})
+
+    # NOTE(review): this cast raises if any UPRN is still missing after the manual fill
+    data["uprn"] = data["uprn"].astype(int)
+    merged = data.merge(spatial_data, how="left", on="uprn")
+    # fill NAs
+    for c in ['conservation_status', 'is_listed_building', 'is_heritage_building']:
+        merged[c] = merged[c].fillna(False)
+
+    merged.to_excel(
+        "/Users/khalimconn-kowlessar/Documents/hestia/Customers/United Living/Potential GMCA props 05.03 - data "
+        "pulled.xlsx",
+        index=False
+    )
diff --git a/etl/customers/waltham_forest/whlg eligibile properties.py b/etl/customers/waltham_forest/whlg eligibile properties.py
new file mode 100644
index 00000000..9e1949f7
--- /dev/null
+++ b/etl/customers/waltham_forest/whlg eligibile properties.py
@@ -0,0 +1,85 @@
+"""
+This is the list of properties, based on the EPC data, that look eligible for WHLG
+"""
+import pandas as pd
+from etl.epc.settings import EARLIEST_EPC_DATE
+from etl.spatial.OpenUprnClient import OpenUprnClient
+
+epc_data = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates/domestic-E09000031-Waltham-Forest/certificates.csv"
+)
+epc_data.columns = [c.replace("_", "-").lower() for c in epc_data.columns]
+epc_data = epc_data[epc_data["lodgement-date"] >= EARLIEST_EPC_DATE]
+# Keep only certificates that carry a UPRN, stored as an integer
+epc_data = epc_data[~pd.isnull(epc_data["uprn"])]
+epc_data["uprn"] = epc_data["uprn"].astype(int)
+# Restrict to EPC ratings D-G and to private rented or owner-occupied tenures
+epc_data = epc_data[epc_data["current-energy-rating"].isin(["D", "E", "F", "G"])]
+epc_data = epc_data[epc_data["tenure"].isin(
+ ["rental (private)", "Rented (private)", "owner-occupied", "Owner-occupied"])
+]
+# Postcode list read from the "Eligible postcodes" sheet of the WHLG workbook
+whlg_eligible_postcodes = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx",
+ sheet_name="Eligible postcodes",
+ header=1
+)
+# Format: keep only the two columns used below
+whlg_eligible_postcodes = whlg_eligible_postcodes[['Postcode', 'Local Authority']]
+# Spatial flags (conservation status, listed, heritage) for every remaining UPRN
+uprns = epc_data["uprn"].unique()
+# Get data
+ca_data = OpenUprnClient.get_spatial_data(uprns, "retrofit-data-dev")
+epc_data = epc_data.merge(
+ ca_data[["UPRN", "conservation_status", "is_listed_building", "is_heritage_building"]].rename(
+ columns={"UPRN": "uprn"}
+ ),
+ how="left",
+ on="uprn",
+)
+# Any one of the three flags being True marks the property as restricted
+epc_data["has_conservation_restrictions"] = (
+ (epc_data["conservation_status"] == True)
+ | (epc_data["is_listed_building"] == True)
+ | (epc_data["is_heritage_building"] == True)
+)
+# Exploratory check only - the result of this expression is discarded
+whlg_eligible_postcodes["Local Authority"].value_counts()
+
+whlg_eligible_postcodes = whlg_eligible_postcodes[whlg_eligible_postcodes["Local Authority"] == "Waltham Forest"]
+
+# Pathway 1:
+# Match based on eligible postcodes
+pathway1 = epc_data[epc_data["postcode"].isin(whlg_eligible_postcodes["Postcode"].values)]
+pathway1 = pathway1[
+ [
+ "uprn", "address", "address1", "postcode", "current-energy-rating", "current-energy-efficiency",
+ "lodgement-date",
+ "has_conservation_restrictions", "walls-description", "roof-description", "mainheat-description"
+ ]
+]
+
+pathway1 = pathway1.rename(
+ columns={
+ "current-energy-rating": "EPC Rating", "current-energy-efficiency": "SAP Score",
+ "lodgement-date": "EPC Date", "has_conservation_restrictions": "Conservation Area Restrictions",
+ "walls-description": "Wall Type", "roof-description": "Roof Type", "mainheat-description": "Main Heating"
+ }
+)
+
+pathway1["EPC Date"] = pd.to_datetime(pathway1["EPC Date"]).dt.strftime("%Y-%m-%d")
+# Create a year EPC was lodged
+pathway1["EPC Year"] = pd.to_datetime(pathway1["EPC Date"]).dt.year
+# Exploratory look at the F and G rated properties; nothing below uses low_epc
+low_epc = pathway1[pathway1["EPC Rating"].isin(["F", "G"])]
+low_epc["EPC Rating"].value_counts()
+low_epc.tail(1)[["address", "postcode"]]
+
+pathway1.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Waltham Forest WHLG - Pathway 1 Eligibility.csv",
+ index=False
+)
+
+# Pathway 2 or 3
+# The household will need to be means tested
+pathway2 = epc_data[~epc_data["uprn"].isin(pathway1["uprn"].values)]
diff --git a/etl/customers/warwick/remote_assessments.py b/etl/customers/warwick/remote_assessments.py
new file mode 100644
index 00000000..a9b654b7
--- /dev/null
+++ b/etl/customers/warwick/remote_assessments.py
@@ -0,0 +1,123 @@
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+PORTFOLIO_ID = 115
+USER_ID = 8
+
+
+def app():
+ """
+ Used to set up the remote assessments for Warwick: saves the asset list, the non-invasive
+ recommendations and the valuations as CSVs with save_csv_to_s3, then prints the request body
+ """
+ asset_list = [
+ {
+ "uprn": 10033604792,
+ "address": "Flat 2, 3 Green Street",
+ "postcode": "W1K 6RN"
+ },
+ {
+ "uprn": 10033604794,
+ "address": "Flat 4, 3 Green Street",
+ "postcode": "W1K 6RN"
+ },
+ {
+ "uprn": 10033615515,
+ "address": "Apartment 4, 52 Green Street",
+ "postcode": "W1K 6RS"
+ }
+ ]
+ asset_list = pd.DataFrame(asset_list)
+
+ # Store the asset list in s3
+ filename = f"{USER_ID}/{PORTFOLIO_ID}/asset_list.csv"
+ save_csv_to_s3(
+ dataframe=asset_list,
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=filename
+ )
+ # One entry per UPRN, each holding a list of recommendation dicts (type, sap_points, survey)
+ non_invasive_recommendations = [
+ {
+ "uprn": 10033604792,
+ "recommendations": [
+ {
+ "type": "internal_wall_insulation",
+ "sap_points": 16,
+ "survey": True
+ }
+ ]
+ },
+ {
+ "uprn": 10033604794,
+ "recommendations": [
+ {
+ "type": "internal_wall_insulation",
+ "sap_points": 14,
+ "survey": True
+ }
+ ]
+ },
+ {
+ "uprn": 10033615515,
+ "recommendations": [
+ {
+ "type": "room_roof_insulation",
+ "sap_points": 12,
+ "survey": True
+ },
+ {
+ "type": "internal_wall_insulation",
+ "sap_points": 2,
+ "survey": True
+ }
+ ]
+ }
+ ]
+ # NOTE(review): "recommendations" holds a list per row - confirm how save_csv_to_s3 serialises it
+ # Store non-invasive recommendations in S3
+ non_invasive_recommendations_filename = f"{USER_ID}/{PORTFOLIO_ID}/non_invasive_recommendations.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(non_invasive_recommendations),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=non_invasive_recommendations_filename
+ )
+
+ valuation_data = [
+ {
+ "uprn": 10033604792,
+ "value": 3_692_000
+ },
+ {
+ "uprn": 10033604794,
+ "value": 3_789_000
+ },
+ {
+ "uprn": 10033615515,
+ "value": 3_499_000
+ }
+ ]
+
+ # Store valuation data to s3
+ valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv"
+ save_csv_to_s3(
+ dataframe=pd.DataFrame(valuation_data),
+ bucket_name="retrofit-plan-inputs-dev",
+ file_name=valuation_filename
+ )
+ # Request body referencing the three stored files; it is only printed here, not sent anywhere
+ body = {
+ "portfolio_id": str(PORTFOLIO_ID),
+ "housing_type": "Private",
+ "goal": "Increasing EPC",
+ "goal_value": "C",
+ "trigger_file_path": filename,
+ "already_installed_file_path": "",
+ "patches_file_path": "",
+ "non_invasive_recommendations_file_path": non_invasive_recommendations_filename,
+ "valuation_file_path": valuation_filename,
+ "scenario_name": "Full package remote assessment",
+ "multi_plan": True,
+ "budget": None,
+ }
+ print(body)
diff --git a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
index aca36584..76087a76 100644
--- a/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
+++ b/etl/eligibility/ha_15_32/ha_analysis_batch_3.py
@@ -1,7 +1,7 @@
import os
import re
import openpyxl
-import Levenshtein
+from fuzzywuzzy import fuzz
from pathlib import Path
import msgpack
from datetime import datetime
@@ -2771,7 +2771,8 @@ class DataLoader:
match_to = [x.replace(" ", "") for x in match_to]
# Perform matching between full key and match_to
- distances = [Levenshtein.distance(matching_string, s) for s in match_to]
+ distances = [100 - fuzz.ratio(matching_string, s) for s in match_to]
+
best_match_index = distances.index(min(distances))
# We might want to consider a threshold for the distance, however for the momeny,
# we don't consider this for the moment
@@ -2897,6 +2898,17 @@ class DataLoader:
# Merge onto the survey list
survey_list = survey_list.merge(matching_lookup, how='left', on="survey_list_row_id")
+ # TEMP FOR NEWER WORK
+ # matching_lookup = matching_lookup.merge(
+ # asset_list[["asset_list_row_id", "UPRN"]], how="left", on="asset_list_row_id"
+ # ).merge(
+ # survey_list[["survey_list_row_id", "NO.", "Street / Block Name", "Post Code"]],
+ # how="left", on="survey_list_row_id"
+ # )
+ # matching_lookup.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Plus Dane/surveys_to_assets.csv"
+ # )
+
return survey_list
@staticmethod
diff --git a/etl/epc/Dataset.py b/etl/epc/Dataset.py
index 3f2e810e..83a85b78 100644
--- a/etl/epc/Dataset.py
+++ b/etl/epc/Dataset.py
@@ -203,11 +203,11 @@ class TrainingDataset(BaseDataset):
common_cols = [[col + "_starting", col + "_ending"] for col in common_cols]
self.df = self.df.loc[
- :,
- no_suffix_cols
- + only_ending_cols
- + [col for cols in common_cols for col in cols],
- ]
+ :,
+ no_suffix_cols
+ + only_ending_cols
+ + [col for cols in common_cols for col in cols],
+ ]
def _remove_abnormal_change_in_floor_area(self):
"""
@@ -511,7 +511,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_sandstone_or_limestone"]
== expanded_df["is_sandstone_or_limestone_ending"]
)
- ]
+ ]
elif component == "floor":
expanded_df = expanded_df[
(expanded_df["is_suspended"] == expanded_df["is_suspended_ending"])
@@ -528,7 +528,7 @@ class TrainingDataset(BaseDataset):
expanded_df["is_to_external_air"]
== expanded_df["is_to_external_air_ending"]
)
- ]
+ ]
elif component == "roof":
expanded_df = expanded_df[
(expanded_df["is_pitched"] == expanded_df["is_pitched_ending"])
@@ -541,7 +541,7 @@ class TrainingDataset(BaseDataset):
expanded_df["has_dwelling_above"]
== expanded_df["has_dwelling_above_ending"]
)
- ]
+ ]
return expanded_df
diff --git a/etl/epc/Record.py b/etl/epc/Record.py
index 4c1a912b..9ff1de0a 100644
--- a/etl/epc/Record.py
+++ b/etl/epc/Record.py
@@ -139,28 +139,22 @@ class EPCRecord:
self._clean_records_using_epc_records()
self._clean_with_data_processor()
-
self._expand_prepared_epc_to_attributes()
-
self._identify_delta_between_prepared_and_original_records()
# Process to create uvalues for the single epc record
-
- # selff.df = self.epc_record_as_dataframe('prepared_epc')
-
+ # self.df = self.epc_record_as_dataframe('prepared_epc')
# self._feature_generation()
# self._drop_features()
return
- self._expand_description_to_features()
- self._expand_description_to_uvalues()
-
+ # self._expand_description_to_features()
+ # self._expand_description_to_uvalues()
+ #
# self._generate_uvalues()
# self._validate_expanded_description()
# self._validate_u_values()
- # etc
- pass
def _drop_features(self):
"""
@@ -359,6 +353,8 @@ class EPCRecord:
self._clean_property_dimensions()
self._clean_number_lighting_outlets()
self._clean_floor_level()
+ self._clean_floor_height()
+ self._clean_constituency()
# self._clean_potential_energy_efficiency()
# self._clean_environment_impact_potential()
@@ -387,6 +383,31 @@ class EPCRecord:
return df
+    def _clean_floor_height(self):
+        """ Remaps anomalies in floor height to the average for the same property type and built form """
+        # An anomaly is a height above average + 10 standard deviations, or at or below 1.665
+        is_comparable = (
+            (self.cleaning_data["property_type"] == self.prepared_epc["property-type"]) &
+            (self.cleaning_data["built_form"] == self.prepared_epc["built-form"])
+        )
+        reference_heights = self.cleaning_data[is_comparable]["floor_height"]
+        average, sd = reference_heights.mean(), reference_heights.std()
+        if pd.isnull(average):
+            return  # no comparable heights: keep the value rather than overwrite it with NaN
+        if self.prepared_epc["floor-height"] > average + 10 * sd or self.prepared_epc["floor-height"] <= 1.665:
+            self.prepared_epc["floor-height"] = average
+
+ def _clean_constituency(self):
+ """
+ Fills a null or empty constituency with "E14000883" when the local authority is "E06000044"
+ """
+ if pd.isnull(self.prepared_epc["constituency"]) or (self.prepared_epc["constituency"] == ""):
+ if self.prepared_epc["local-authority"] != "E06000044":
+ raise NotImplementedError(
+ "This function is only implemented for Portsmouth, in the single edgecase seen"
+ )
+ self.prepared_epc["constituency"] = "E14000883"
+
def _clean_floor_level(self):
"""
This method will clean the floor level, if empty or invalid
diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py
new file mode 100644
index 00000000..f085c8fb
--- /dev/null
+++ b/etl/find_my_epc/AssetListEpcData.py
@@ -0,0 +1,133 @@
+import time
+import pandas as pd
+from tqdm import tqdm
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+from backend.SearchEpc import SearchEpc
+from utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+class AssetListEpcData:
+
+ def __init__(self, asset_list: pd.DataFrame, epc_auth_token: str):
+
+ """
+ Pulls the data associated with an asset list: EPC API data, data retrieved from the find my epc
+ website, and the non-intrusive recommendations / patches derived from it
+ :param asset_list: one row per property with "address" and "postcode" (optionally "uprn", "patch", ...)
+ :param epc_auth_token: auth token passed to SearchEpc
+ """
+
+ # Check the asset list contains the correct columns
+
+ self.asset_list = self.check_asset_list(asset_list)
+ self.epc_auth_token = epc_auth_token
+ # Populated by get_data, get_non_invasive_recommendations and get_patch respectively
+ self.extracted_data = None
+ self.non_invasive_recommendations = None
+ self.patches = None
+
+ @staticmethod
+ def check_asset_list(asset_list):
+ # TODO: Update this with pydantic
+ # No validation is done yet - the asset list is returned unchanged
+ return asset_list
+
+ def get_non_invasive_recommendations(self):
+
+ """
+ Extracts non-invasive recommendations in a format that can be used by the engine
+ :return: None - the result is stored on self.non_invasive_recommendations
+ """
+
+ if self.extracted_data is None:
+ raise ValueError("Please run get_data first")
+
+ self.non_invasive_recommendations = [
+ {
+ "uprn": r.get("uprn"),
+ "address": r["address"],
+ "postcode": r["postcode"],
+ "recommendations": r["recommendations"]
+ } for r in self.extracted_data
+ ]
+
+ def get_patch(self):
+ """
+ Collects the "patch" dict of every extracted record that has one, together with its uprn
+ :return: None - the result is stored on self.patches
+ """
+ if self.extracted_data is None:
+ raise ValueError("extracted data is missing - run get_data first")
+
+ self.patches = [
+ {
+ "uprn": r.get("uprn"),
+ **r.get("patch")
+ } for r in self.extracted_data if r.get("patch")
+ ]
+
+ def get_data(self):
+ """ Looks up each asset list row via SearchEpc and find my epc; stores results on self.extracted_data """
+ logger.info("Retrieving data for given asset list")
+
+ # Pull the additional data
+ extracted_data = []
+ for _, home in tqdm(self.asset_list.iterrows(), total=len(self.asset_list)):
+ add1 = home["address"]
+ pc = home["postcode"]
+ # Retrieve the EPC data
+ epc_searcher = SearchEpc(
+ address1=add1,
+ postcode=pc,
+ uprn=home.get("uprn"),
+ auth_token=self.epc_auth_token,
+ os_api_key="",
+ )
+ epc_searcher.ordnance_survey_client.property_type = home.get("property_type")
+ epc_searcher.ordnance_survey_client.built_form = home.get("built_form")
+ epc_searcher.find_property(skip_os=True)
+ # Rows without any EPC are skipped and do not appear in extracted_data
+ if epc_searcher.newest_epc is None:
+ continue
+ # Rows with a "patch" value search find my epc with the asset list address, not the EPC one
+ if not pd.isnull(home.get("patch")):
+ epc_searcher.newest_epc["address1"] = add1
+ # NOTE(review): an error raised by the fallback lookup below is not caught - confirm intended
+ # Attempt both methods: address1 + address2 first, then address1 alone if that raises
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"],
+ postcode=epc_searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except Exception as e:
+ logger.error(f"Error retrieving find my epc data: {e}")
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=epc_searcher.newest_epc["address1"],
+ postcode=epc_searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ time.sleep(0.5)
+ # We need uprn
+
+ to_append = {
+ "uprn": home.get("uprn"),
+ "address": home["address"],
+ "postcode": home["postcode"],
+ **find_epc_data,
+ }
+ if not pd.isnull(home.get("patch")):
+ to_append["patch"] = {
+ "current-energy-rating": find_epc_data["current_epc_rating"],
+ "current-energy-efficiency": find_epc_data["current_epc_efficiency"],
+ "potential-energy-rating": find_epc_data["potential_epc_rating"],
+ "potential-energy-efficiency": find_epc_data["potential_epc_efficiency"],
+ **find_epc_data["epc_data"]
+ }
+
+ extracted_data.append(to_append)
+
+ self.extracted_data = extracted_data
+ logger.info("Data Extrction complete")
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
new file mode 100644
index 00000000..86c3fda1
--- /dev/null
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -0,0 +1,480 @@
+import re
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from datetime import datetime
+
+from utils.logger import setup_logger
+
+logger = setup_logger()
+
+
+class RetrieveFindMyEpc:
+    # Looks up a property on find-energy-certificate.service.gov.uk and reads its certificate page.
+
+    # Postcode search results page; ``postcode_input`` is substituted with str.format
+    SEARCH_POSTCODE_URL = (
+        "https://find-energy-certificate.service.gov.uk/find-a-certificate/search-by-postcode?postcode={postcode_input}"
+    )
+    # Prefix joined onto the certificate link found on the search results page
+    BASE_ENERGY_URL = "https://find-energy-certificate.service.gov.uk"
+
+    # Headers sent with every request (a desktop Chrome User-Agent string)
+    HEADERS = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
+                      'Chrome/111.0.0.0 Safari/537.36'
+    }
+
+    def __init__(self, address: str, postcode: str):
+        """
+        Set up a lookup of the latest EPC data on the find my epc website for a single property
+        :param address: The address of the property
+        :param postcode: The postcode of the property
+        """
+        self.address = address
+        self.postcode = postcode
+
+        # Comparison form of the address: commas and spaces removed, lower-cased
+        without_commas = self.address.replace(",", "")
+        self.address_cleaned = without_commas.replace(" ", "").lower()
+        # Filled with the wall descriptions when a certificate page is parsed
+        self.walls = []
+
+    @staticmethod
+    def extract_low_carbon_sources(soup):
+        """
+        Collect the entries listed under the "Low and zero carbon energy sources" heading.
+
+        :param soup: Parsed certificate page
+        :return: dict mapping each listed source's text to True; empty when the heading or its list is absent
+        """
+        # Find the section header
+        section_header = soup.find("h3", string="Low and zero carbon energy sources")
+        if not section_header:
+            return {}
+
+        # Locate the list following the header
+        energy_list = section_header.find_next("ul")
+        if energy_list is None:
+            # Heading present but nothing listed after it: report no sources instead of raising
+            return {}
+
+        # Extract the list items
+        return {item.get_text(strip=True): True for item in energy_list.find_all("li")}
+
+    @staticmethod
+    def get_text(elem):
+        """Return the whitespace-stripped text of ``elem``, or None when ``elem`` is falsy."""
+        if not elem:
+            return None
+        return elem.get_text(strip=True)
+
+    def extract_epc_data(self, soup):
+        """
+        Extract property-level EPC fields from a parsed certificate page.
+
+        A value whose source element is missing from the page is returned as None rather than raising,
+        matching the existing ``if match else None`` fallbacks.
+
+        :param soup: Parsed certificate page (queried with ``find`` and ``select``)
+        :return: dict holding 'total-floor-area', the description / energy-efficiency pair for each
+            property feature, 'energy-consumption-current', 'co2-emissions-current' and
+            'co2-emissions-potential'
+        """
+
+        results = {}
+
+        # 1. Total floor area - the first space-separated token of the <dd> text is read as an integer
+        floor_area_label = soup.find("dt", string="Total floor area")
+        floor_area_text = None
+        if floor_area_label is not None:
+            floor_area_text = self.get_text(floor_area_label.find_next_sibling("dd"))
+        results['total-floor-area'] = int(floor_area_text.split(" ")[0]) if floor_area_text else None
+
+        # Table with features
+        rows = soup.select("table.govuk-table tbody tr")
+
+        # Ratings are re-capitalised to the spellings used in this mapping
+        rating_map = {
+            "Very poor": "Very Poor",
+            "Very good": "Very Good"
+        }
+
+        def get_feature_row_text(feature_name, index=0):
+            """Return (description, rating) of the index-th row whose <th> contains feature_name."""
+            matches = [row for row in rows if row.find("th") and feature_name in row.find("th").text]
+            if len(matches) > index:
+                cells = matches[index].find_all("td")
+                # A row may carry fewer cells than expected; missing cells become None
+                description = self.get_text(cells[0]) if len(cells) > 0 else None
+                rating = self.get_text(cells[1]) if len(cells) > 1 else None
+                return description, rating_map.get(rating, rating)
+            return None, None
+
+        def first_float(tag):
+            """Return the first run of digits/dots in the tag's text as a float, or None."""
+            if tag is None:
+                return None
+            found = re.search(r"([\d.]+)", tag.text)
+            return float(found.group(1)) if found else None
+
+        # 2-3. First wall description and rating
+        results['walls-description'], results['walls-energy-eff'] = get_feature_row_text("Wall", 0)
+
+        # 4-5. First roof description and rating
+        results['roof-description'], results['roof-energy-eff'] = get_feature_row_text("Roof", 0)
+
+        # 6-7. Windows description and rating
+        results['windows-description'], results['windows-energy-eff'] = get_feature_row_text("Window")
+
+        # 8-9. Main heating description and rating
+        # "Main heating" is also a substring of "Main heating control"; index 0 takes the first matching row
+        results['mainheat-description'], results['mainheat-energy-eff'] = get_feature_row_text("Main heating")
+
+        # 10-11. Main heating control description and rating
+        results['mainheatcont-description'], results['mainheatc-energy-eff'] = get_feature_row_text(
+            "Main heating control"
+        )
+
+        # 12-13. Hot water description and rating
+        # NOTE(review): 'hot-water-energy-ef' lacks the trailing 'f' that the other '*-energy-eff' keys
+        # have - looks like a typo, but the key is left unchanged here; confirm against consumers
+        results['hotwater-description'], results['hot-water-energy-ef'] = get_feature_row_text("Hot water")
+
+        # 14-15. Lighting description and rating
+        results['lighting-description'], results['lighting-energy-eff'] = get_feature_row_text("Lighting")
+
+        # 16. Floor description
+        results['floor-description'], _ = get_feature_row_text("Floor")
+
+        # 17. Secondary heating description
+        results['secondheat-description'], _ = get_feature_row_text("Secondary heating")
+
+        # 18. Primary energy use
+        p_energy = soup.find(
+            string=lambda t: t is not None and "primary energy use for this property per year" in t.lower()
+        )
+        # We should always have this, but fall back to None if the sentence is absent
+        match = re.search(r"(\d+)\s+kilowatt", p_energy) if p_energy is not None else None
+        results['energy-consumption-current'] = int(match.group(1)) if match else None
+
+        # 19. Current CO2 emissions
+        # We should always have this, but fall back to None if the element is absent
+        results['co2-emissions-current'] = first_float(soup.find("dd", id="eir-property-produces"))
+        # Need co2-emiss-curr-per-floor-area
+
+        # 20. Potential CO2 emissions
+        results['co2-emissions-potential'] = first_float(soup.find("dd", id="eir-potential-production"))
+
+        return results
+
+    def retrieve_newest_find_my_epc_data(self, sap_2012_date=None):
+        """
+        For a post code and address, we pull out all the required data from the find my epc website
+
+        The postcode search page is fetched first. Every result whose cleaned address starts with
+        ``self.address_cleaned`` is a candidate, and the candidate with the latest expiry date is opened
+        and parsed. ``self.walls`` is replaced with the wall descriptions found on that page.
+
+        :param sap_2012_date: Optional cut-off date handed on to ``format_recommendations``
+        :return: dict with the certificate id, current/potential rating letters and scores, the heating and
+            hot water bill text, the formatted recommendations, ``epc_data``, the assessment details and one
+            True-valued entry per low/zero carbon energy source
+        :raises ValueError: when no search result matches the address, or an expected assessment field is
+            missing from the certificate page
+        """
+
+        postcode_input = self.postcode.replace(" ", "+")
+        postcode_search = self.SEARCH_POSTCODE_URL.format(postcode_input=postcode_input)
+        postcode_response = requests.get(postcode_search, headers=self.HEADERS)
+
+        postcode_res = BeautifulSoup(postcode_response.text, features="html.parser")
+        rows = postcode_res.find_all('tr', class_='govuk-table__row')
+
+        extracted_table = []
+        for row in rows:
+            # Extract the address and URL; rows without a certificate link are skipped
+            address_tag = row.find('a', class_='govuk-link')
+            if address_tag is None:
+                continue
+            extracted_address = address_tag.text.strip()
+            extracted_address_url = address_tag['href']
+
+            extracted_address_cleaned = extracted_address.replace(",", "").replace(" ", "").lower()
+            if not extracted_address_cleaned.startswith(self.address_cleaned):
+                continue
+
+            # If the address is a match, we can extract the data
+
+            # Extract the expiry date. A matching row without a date cell is kept with the earliest
+            # representable date, so it is only chosen when no dated match exists (passing None to
+            # strptime would raise TypeError).
+            expiry_date = datetime.min
+            expiry_date_tag = row.find('td', class_='govuk-table__cell date')
+            if expiry_date_tag is not None:
+                expiry_date = datetime.strptime(expiry_date_tag.parent.find('span').text.strip(), '%d %B %Y')
+
+            extracted_table.append(
+                {
+                    "extracted_address": extracted_address,
+                    "extracted_address_url": extracted_address_url,
+                    "expiry_date": expiry_date,
+                }
+            )
+
+        if not extracted_table:
+            raise ValueError("No EPC found")
+
+        # We take the one with the most recent expiry date (the first listed wins a tie)
+        newest_match = max(extracted_table, key=lambda x: x['expiry_date'])
+
+        chosen_epc = self.BASE_ENERGY_URL + newest_match['extracted_address_url']
+        # The certificate identifier is the last path segment of the certificate URL
+        epc_certificate = chosen_epc.split('/')[-1]
+
+        address_response = requests.get(chosen_epc, headers=self.HEADERS)
+        address_res = BeautifulSoup(address_response.text, features="html.parser")
+
+        # Key data we want to retrieve:
+        # 1) Rating
+        # 2) Bills estimates
+        # 3) Recommendations and SAP points
+        # 4) Low and zero carbon energy sources
+        # 5) The wall types of the property - used for determining if we have an extension wall insulation
+        # recommendation
+
+        # The description text holds one sentence for the current rating and one for the potential rating.
+        # In each sentence the score is the last space-separated token and the letter the sixth from last.
+        ratings = address_res.find('desc', {'id': 'svg-desc'}).text
+        current_rating = ratings.split(".")[0]
+        potential_rating = ratings.split(".")[1]
+        current_sap = int(current_rating.split(' ')[-1])
+
+        # Floor area is read later, by extract_epc_data
+
+        # Retrieve the energy consumption
+        bills = address_res.find('div', {'id': 'bills-affected'})
+        bills_list = bills.find_all('li')
+        # If entries are missing, it's usually because the EPC was very old. Early EPCs did not have this
+        # information. Each entry is read only if present, so a single-item list no longer raises.
+        heating_text = bills_list[0].text if len(bills_list) > 0 else None
+        hot_water_text = bills_list[1].text if len(bills_list) > 1 else None
+
+        # Retrieve the recommendations and SAP points
+        recommendations = []
+        recommendations_div = address_res.find('div', class_='epb-recommended-improvements')
+        if recommendations_div:
+            # Find all h3 headers for each step and extract their related information
+            step_headers = recommendations_div.find_all('h3', class_='govuk-heading-m')
+            previous_sap_score = current_sap
+            previous_epc = current_rating.split(' ')[-6]
+            for step_num, step_header in enumerate(step_headers, start=1):
+                # Extract the step title (the measure)
+                measure_title = step_header.text.strip().replace(f"Step {step_num}: ", "")
+
+                # Find the div containing the potential rating within the same section
+                potential_rating_div = step_header.find_next(
+                    'div', class_='epb-recommended-improvements__potential-rating'
+                )
+
+                # Check if the potential rating div is found; a step without one is not recorded
+                if potential_rating_div:
+                    # Extract the rating text within the SVG text element
+                    extracted_rating_text = potential_rating_div.find('text', class_='govuk-!-font-weight-bold')
+                    if extracted_rating_text is not None:
+                        rating_text = extracted_rating_text.text.strip()
+                    else:
+                        # No rating shown for this step: carry the previous rating forward (0 SAP points)
+                        rating_text = " ".join([str(previous_sap_score), previous_epc])
+                    # Parse the rating text to separate the numeric rating and EPC letter
+                    new_rating = int(rating_text.split()[0])
+                    new_epc = rating_text.split()[1]
+
+                    # Append the information as a dictionary to the recommendations list
+                    recommendations.append({
+                        "step": step_num,
+                        "measure": measure_title,
+                        "new_rating": new_rating,
+                        "new_epc": new_epc,
+                        "sap_points": new_rating - previous_sap_score
+                    })
+                    previous_sap_score = new_rating
+                    previous_epc = new_epc
+
+        # Search for the assessment information
+        assessment_information = address_res.find('div', {'id': 'information'})
+        # Parse this information
+        rows = assessment_information.find_all('div', class_='govuk-summary-list__row')
+        # Create a dictionary to hold the parsed information
+        assessment_data = {}
+        for row in rows:
+            key = row.find('dt').text.strip()
+            if key == "Type of assessment":
+                # We dont reliably extract this
+                continue
+            value_tag = row.find('dd')
+
+            # Check if value contains a link (email)
+            if value_tag.find('a'):
+                value = value_tag.find('a').text.strip()
+            elif value_tag.find('summary'):
+                value = value_tag.find('span').text.strip()
+            else:
+                value = value_tag.text.strip()
+
+            # These are keys that we have for both the surveyor and the accreditation scheme. Firstly, we'll
+            # get the surveyor's name and email so we make that information clear
+            if key in ["Telephone", "Email"]:
+                # The first occurrence is stored for the assessor, any later one for the scheme
+                if "Assessor's " + key not in assessment_data:
+                    assessment_data["Assessor's " + key] = value
+                else:
+                    assessment_data["Accreditation Scheme's " + key] = value
+                continue
+
+            assessment_data[key] = value
+
+        expected_keys = [
+            'Assessor’s name',
+            "Assessor's Telephone",
+            "Assessor's Email",
+            'Assessor’s ID',
+            'Accreditation scheme',
+            'Assessor’s declaration',
+            "Accreditation Scheme's Telephone",
+            "Accreditation Scheme's Email",
+            'Date of assessment',
+            'Date of certificate'
+        ]
+        # Check we have all the expected keys
+        for key in expected_keys:
+            if key not in assessment_data:
+                raise ValueError(f"Missing key: {key}")
+
+        # The wall types of the property
+        property_features_body = address_res.find("tbody", class_="govuk-table__body")
+        property_features_rows = property_features_body.find_all("tr") if property_features_body is not None else []
+
+        # Extract wall types; rows without a header cell or without data cells are ignored
+        self.walls = []
+        for row in property_features_rows:
+            header = row.find("th")
+            cells = row.find_all("td")
+            if header is not None and cells and header.text.strip() == "Wall":
+                self.walls.append(cells[0].text.strip())
+
+        # Finally, we format the recommendations
+        recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)
+
+        # 4) Low and zero carbon energy sources
+        low_carbon_energy_sources = self.extract_low_carbon_sources(address_res)
+
+        # 5) Pull out the EPC data
+        epc_data = self.extract_epc_data(address_res)
+
+        resulting_data = {
+            'epc_certificate': epc_certificate,
+            'current_epc_rating': current_rating.split(' ')[-6],
+            'current_epc_efficiency': current_sap,
+            'potential_epc_rating': potential_rating.split(' ')[-6],
+            "potential_epc_efficiency": int(potential_rating.split(' ')[-1]),
+            "heating_text": heating_text,
+            "hot_water_text": hot_water_text,
+            "recommendations": recommendations,
+            "epc_data": epc_data,
+            **assessment_data,
+            **low_carbon_energy_sources,
+        }
+
+        return resulting_data
+
+    def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
+        """
+        This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
+
+        Each EPC measure title is mapped to zero or more engine measure types; one output entry is produced
+        per mapped type. A title that is not in the map is logged and skipped, so one unfamiliar title does
+        not abort the whole property.
+
+        :param recommendations: The recommendations from the EPC (dicts with "measure" and "sap_points")
+        :param assessment_data: The assessment data from the EPC ("Date of certificate" is read when
+            sap_2012_date is given)
+        :param sap_2012_date: The date of the SAP 2012 update
+        :return: list of dicts with "type", "sap_points" and "survey" (plus "suitable" for solar PV)
+        """
+
+        # EPC recommendation title -> engine measure types. An empty list means "recognised, nothing to emit".
+        measure_map = {
+            "Internal or external wall insulation": ["internal_wall_insulation", "external_wall_insulation"],
+            "Hot water cylinder insulation": ["hot_water_tank_insulation"],
+            "Hot water cylinder thermostat": ["cylinder_thermostat"],
+            "High performance external doors": ["insulated_doors"],
+            "Floor insulation (solid floor)": ["solid_floor_insulation"],
+            "Floor insulation (suspended floor)": ["suspended_floor_insulation"],
+            "Double glazed windows": ["double_glazing"],
+            "Cavity wall insulation": ["cavity_wall_insulation"],
+            "Replace boiler with new condensing boiler": ["boiler_upgrade"],
+            "Floor insulation": ["floor_insulation"],  # Recommendation typically associated to older EPCs
+            "Heating controls (programmer, room thermostat and TRVs)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Low energy lighting": ["low_energy_lighting"],
+            "Increase loft insulation to 270 mm": ["loft_insulation"],
+            "Heating controls (thermostatic radiator valves)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Solar water heating": ["solar_water_heating"],
+            "Solar photovoltaic panels, 2.5 kWp": ["solar_pv"],
+            "Heating controls (room thermostat and TRVs)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Change heating to gas condensing boiler": ["boiler_upgrade"],
+            "Fan assisted storage heaters and dual immersion cylinder": ["high_heat_retention_storage_heater"],
+            "Flat roof or sloping ceiling insulation": ["flat_roof_insulation"],
+            "Heating controls (room thermostat)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Band A condensing boiler": ["boiler_upgrade"],
+            "Double glazing": ["double_glazing"],
+            "Flue gas heat recovery device in conjunction with boiler": ["flue_gas_heat_recovery"],
+            "Wind turbine": ["wind_turbine"],
+            "Loft insulation": ["loft_insulation"],
+            "Solar photovoltaic (PV) panels": ["solar_pv"],
+            "Party wall insulation": ["party_wall_insulation"],
+            'Draught proofing': ["draught_proofing"],
+            "Roof insulation recommendation": [],
+            "Cavity wall insulation recommendation": [],
+            "Windows draught proofing": [],
+            "Low energy lighting for all fixed outlets": ["low_energy_lighting"],
+            "Cylinder thermostat recommendation": [],
+            "Heating controls recommendation": [],
+            "Replace boiler with Band A condensing boiler": ["boiler_upgrade"],
+            "Band A condensing gas boiler": ["boiler_upgrade"],
+            "Solar panel recommendation": [],
+            "Double glazing recommendation": [],
+            "Solid wall insulation recommendation": [],
+            "Fuel change recommendation": [],
+            "PV Cells recommendation": [],
+            "Replacement glazing units": ["double_glazing"],
+            "Heating controls (time and temperature zone control)": ["time_temperature_zone_control"],
+            "High heat retention storage heaters": ["high_heat_retention_storage_heater"],
+            "Gas condensing boiler": ["boiler_upgrade"],
+            "Change room heaters to condensing boiler": ["boiler_upgrade"],
+            "Cylinder thermostat": ["cylinder_thermostat"],
+            "Heat recovery system for mixer showers": ["heat_recovery_shower"],
+            "Room-in-roof insulation": ["room_in_roof_insulation"],
+            "Fan assisted storage heaters": [],
+            "Fan-assisted storage heaters": [],
+            "Step 1:": [],
+            "Step 2:": [],
+            'Step 3:': [],
+            "Biomass stove with boiler": [],
+            "Replace boiler with biomass boiler": [],
+            "Heating controls (room thermostat and thermostatic radiator valves)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Heating controls (programmer, and thermostatic radiator valves)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Heating controls (programmer and TRVs)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Heating controls (programmer and room thermostat)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            "Replacement warm air unit": [],
+            "Secondary glazing": ["secondary_glazing"],
+            "Condensing heating unit": ["boiler_upgrade"],
+            '???': [],
+            'Solar photovoltaic panels, 2.5kWp': ["solar_pv"],
+            'Heating controls (programmer, room thermostat and thermostatic radiator valves)': [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ],
+            'Translation missing: en.improvement_code.41.title': [],
+            "Condensing boiler (separate from the range cooker)": ["boiler_upgrade"],
+            "Heating controls (programmer and thermostatic radiator valves)": [
+                "roomstat_programmer_trvs", "time_temperature_zone_control"
+            ]
+        }
+
+        # Certificates dated before sap_2012_date are flagged survey=False
+        survey = True
+        if sap_2012_date is not None:
+            certificate_date = datetime.strptime(assessment_data["Date of certificate"], "%d %B %Y")
+            if certificate_date < pd.to_datetime(sap_2012_date):
+                survey = False
+
+        # Only the first wall description decides whether cavity insulation applies to an extension.
+        # self.walls is empty when no wall rows were scraped, so guard before indexing.
+        main_wall_is_solid_brick = bool(self.walls) and "solid brick" in self.walls[0].lower()
+
+        formatted_recommendations = []
+        for rec in recommendations:
+            mapped = measure_map.get(rec["measure"])
+            if mapped is None:
+                logger.warning(f"Unmapped EPC recommendation skipped: {rec['measure']}")
+                continue
+            for measure in mapped:
+                if measure == "cavity_wall_insulation" and main_wall_is_solid_brick:
+                    measure = "extension_cavity_wall_insulation"
+                to_append = {
+                    "type": measure,
+                    "sap_points": rec["sap_points"],
+                    "survey": survey,
+                }
+                if measure == "solar_pv":
+                    to_append["suitable"] = True
+                formatted_recommendations.append(to_append)
+
+        return formatted_recommendations
+
+    @classmethod
+    def get_from_epc(cls, epc):
+        """
+        Build the non-invasive recommendations record for an EPC record.
+
+        The lookup is tried with ``epc["address"]`` first; if anything goes wrong the error is logged and
+        the lookup is repeated with ``epc["address1"]`` (a failure of the retry propagates).
+
+        :param epc: Mapping read for "address", "address1", "postcode" and "uprn"
+        :return: dict with "uprn", "address", "postcode" and the formatted "recommendations"
+        """
+        postcode_key = "postcode"
+        try:
+            # Attempt both methods: full address first
+            primary = cls(address=epc["address"], postcode=epc[postcode_key])
+            scraped = primary.retrieve_newest_find_my_epc_data()
+        except Exception as e:
+            logger.error(f"Error retrieving find my epc data: {e}")
+            # We attempt with the backup address
+            backup = cls(address=epc["address1"], postcode=epc[postcode_key])
+            scraped = backup.retrieve_newest_find_my_epc_data()
+
+        return {
+            "uprn": epc["uprn"],
+            "address": epc["address"],
+            "postcode": epc[postcode_key],
+            "recommendations": scraped["recommendations"],
+        }
diff --git a/etl/find_my_epc/requirements.txt b/etl/find_my_epc/requirements.txt
new file mode 100644
index 00000000..9a3fc73f
--- /dev/null
+++ b/etl/find_my_epc/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+beautifulsoup4
\ No newline at end of file
diff --git a/etl/funding/app.py b/etl/funding/app.py
new file mode 100644
index 00000000..fba48ca4
--- /dev/null
+++ b/etl/funding/app.py
@@ -0,0 +1,35 @@
+"""
+This script prepares the data required for us to perform funding calculations. The starting data should be stored
+on the machine this is being run on, and this will prepare the information and upload it
+"""
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+STAGE = "dev"
+DATA_BUCKET = "retrofit-data-{stage}"
+PROJECTS_SCORES_MATRIX_LOCATION = "/Users/khalimconn-kowlessar/Downloads/ECO4 Full Project Scores Matrix.csv"
+WHLG_ELIGIBLE_POSTCODES = "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx"
+
+
+def app():
+    """Upload the project scores matrix and the tidied WHLG eligible postcodes to the data bucket."""
+    bucket = DATA_BUCKET.format(stage=STAGE)
+
+    # Project scores matrix: read from the local CSV and stored as-is
+    scores = pd.read_csv(PROJECTS_SCORES_MATRIX_LOCATION)
+    save_csv_to_s3(
+        dataframe=scores,
+        bucket_name=bucket,
+        file_name="funding/ECO4 Full Project Scores Matrix.csv"
+    )
+
+    # Warm Homes Local Grant eligible postcodes: keep only the postcode column, lower-cased
+    raw_postcodes = pd.read_excel(WHLG_ELIGIBLE_POSTCODES, sheet_name="Eligible postcodes", header=1)
+    postcodes = raw_postcodes[["Postcode"]].assign(Postcode=raw_postcodes["Postcode"].str.lower())
+
+    save_csv_to_s3(
+        dataframe=postcodes,
+        bucket_name=bucket,
+        file_name="funding/whlg eligible postcodes.csv"
+    )
diff --git a/etl/lodgement/app.py b/etl/lodgement/app.py
new file mode 100644
index 00000000..c1da35dd
--- /dev/null
+++ b/etl/lodgement/app.py
@@ -0,0 +1,326 @@
+import os
+
+import pandas as pd
+
+import utils.file_data_extraction as file_extraction_tools
+from utils.fullSapParser import FullSapParser
+from utils.OsmosisCondtionReportParser import OsmosisConditionReportParser
+
+# Blank output row: one key per output column, all defaulting to None.
+# handler() copies this per property and fills it through update_dictionary_with_check, which rejects
+# any key not listed here.
+output_template = {
+    "Property Address": None,
+    "Osm. ID": None,
+    "Postcode": None,
+    "City/County": None,
+    "District/Town": None,
+    "Funding Stream": None,
+    # "Risk Path": None,
+    "Local Authority": None,
+    "Trustmark Lodgement ID": None,
+    "Certificate Number": None,
+    "EWI UMR": None,
+    "Loft UMR": None,
+    "Windows UMR": None,
+    "Doors UMR": None,
+    "Measure Lodgement Date": None,
+    "Full Lodgement Date": None,
+    "Owner - Name": None,
+    "Owner - Phone": None,
+    "Owner - Email": None,
+    "Tenant - Name": None,
+    "Tenant - Phone": None,
+    "R. Assessor - Name": None,
+    "R. Coordinator - Name": None,
+    "Trustmark Licence Number": None,
+    "Retrofit Assessment Date": None,
+    "Company Name": None,
+    "Retrofit Designer Name": None,
+    "Property Type": None,
+    "Property Detachment": None,
+    "No. of Bedrooms": None,
+    "Property age": None,
+    "SAP Rating Pre (from IMA)": None,
+    "Pre Heat Transfer": None,
+    "Pre Total Floor Area": None,
+    "Pre Heat Demand": None,
+    "Pre Air Tightness": None,
+    "SAP Rating Post (from EPC)": None,
+    "Post Heat Transfer": None,
+    "Post Total Floor Area": None,
+    "Post Heat Demand": None,
+    "Post Air Tightness": None,
+    "Number of Eligible Measures Installed": None,
+    "Total Cost of Works": None,
+    "Annual Fuel Saving (MTP)": None,
+}
+
+
+def update_dictionary_with_check(dictionary, updates):
+    """
+    Updates a dictionary with key-value pairs, raising an error if the key does not exist.
+
+    Every key is validated before anything is written, so a failed call leaves ``dictionary`` unchanged
+    instead of half-updated.
+
+    Args:
+        dictionary (dict): The dictionary to update (modified in place).
+        updates (dict): The updates to apply.
+
+    Raises:
+        KeyError: If a key in updates does not exist in the dictionary.
+    """
+    for key in updates:
+        if key not in dictionary:
+            raise KeyError(f"Key '{key}' does not exist in the dictionary.")
+    dictionary.update(updates)
+
+
+def handler():
+    """
+    This is a simple application that will extract the data from documents that have been uploaded to Sharepoint
+    to populate the lodgement spreadsheet with
+
+    For every property folder under the source path it reads the reports in three sub-folders
+    ("RA Coordinator Info", "Air Tightness Tests" and the "required documents" folder inside
+    "TrustMark Lodgement"), maps the extracted values onto a copy of ``output_template`` and finally writes
+    all rows to a CSV file.
+    :return: None
+    """
+
+    # The source data will eventually come from Sharepoint
+    source_data_path = "/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot"
+    # NOTE(review): the five values below, except funding_stream, are not used anywhere in this function
+    output_template_file = "Trustmark Details - Template REV.25.11.24.xlsx"
+    funding_stream = "HUG2"
+    customer_name = "Shropshire Council"
+    customer_phone = "0345 678 9000"
+    customer_email = "affordablewarmth@shropshire.gov.uk"
+
+    # TODO: In order for this to go live, we need to use Poppler, which needs to be installed
+    #  w/ brew install poppler
+    #  We also need to install Tesseract: brew install tesseract
+
+    # List the folders in the source data path (one folder per property)
+    folders = [x for x in os.listdir(source_data_path) if os.path.isdir(os.path.join(source_data_path, x))]
+
+    # Detected report type -> extractor class; None means the report type is recognised but not parsed
+    extractors = {
+        "elmhurst epr": file_extraction_tools.ElmhurstEprExtractor,
+        "elmhurst summary report": file_extraction_tools.ElmhurstSummaryReportExtractor,
+        "osmosis condition report": OsmosisConditionReportParser,
+        "elmhurst evidence report": None,
+        "full sap xml": FullSapParser,
+        "pulse air permeability": file_extraction_tools.PulseAirPermeabilityExtractor,
+        "elmhurst project handover": file_extraction_tools.ElmhurstProjectHandoverExtractor,
+        "core logic pas assessment report": file_extraction_tools.CoreLogicPasAssessmentReportExtractor,
+    }
+
+    extracted = []
+    for property_folder in folders:
+
+        property_folder_path = os.path.join(source_data_path, property_folder)
+        # List the sub-folders of this property's folder
+        subfolders = [
+            x for x in os.listdir(property_folder_path) if os.path.isdir(os.path.join(property_folder_path, x))
+        ]
+        # The [0] raises IndexError when the property has no matching sub-folder
+        coord_folder = os.path.join(property_folder_path, [f for f in subfolders if "RA Coordinator Info" in f][0])
+
+        # Get the contents of the folder
+        coordinator_folder_contents = [
+            file for file in os.listdir(coord_folder) if os.path.isfile(os.path.join(coord_folder, file))
+        ]
+
+        # We detect the various file types; results are keyed by report type, so a later file of the same
+        # type overwrites an earlier one
+        extracted_contents = {}
+        for filename in coordinator_folder_contents:
+            filepath = os.path.join(coord_folder, filename)
+            if file_extraction_tools.is_pdf(filepath):
+                report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
+                if report_type is None:
+                    raise ValueError(f"Unknown report type for {filename}")
+
+                file_extractor = extractors[report_type]
+                if file_extractor is None:
+                    continue
+
+                extracted_contents[report_type] = file_extractor(filepath).extract()
+
+            if file_extraction_tools.is_xml(filepath):
+                xml_type = file_extraction_tools.detect_xml_report_type(xml_path=filepath)
+                if xml_type is None:
+                    raise ValueError(f"Unknown report type for {filename}")
+                file_extractor = extractors.get(xml_type)
+                if file_extractor is None:
+                    continue
+
+                extracted_contents[xml_type] = file_extractor(filepath).extract()
+
+        # Air tightness test reports (PDF only)
+        att_folder = os.path.join(property_folder_path, [f for f in subfolders if "Air Tightness Tests" in f][0])
+        att_folder_contents = [
+            file for file in os.listdir(att_folder) if os.path.isfile(os.path.join(att_folder, file))
+        ]
+
+        for filename in att_folder_contents:
+            filepath = os.path.join(att_folder, filename)
+            if file_extraction_tools.is_pdf(filepath):
+                report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
+                if report_type is None:
+                    raise ValueError(f"Unknown report type for {filename}")
+                file_extractor = extractors[report_type]
+
+                if file_extractor is None:
+                    continue
+
+                extracted_contents[report_type] = file_extractor(filepath).extract()
+
+        lodgement_folder = os.path.join(
+            property_folder_path, [f for f in subfolders if "TrustMark Lodgement" in f][0]
+        )
+        # Within the lodgement folder, we want the required documents sub-folder
+        lodgement_subfolders = [
+            file for file in os.listdir(lodgement_folder) if os.path.isdir(os.path.join(lodgement_folder, file))
+        ]
+        required_documents_folder = os.path.join(
+            lodgement_folder, [f for f in lodgement_subfolders if "required documents" in f.lower()][0]
+        )
+        # List the contents
+        required_documents_contents = [
+            file for file in os.listdir(required_documents_folder) if
+            os.path.isfile(os.path.join(required_documents_folder, file))
+        ]
+
+        # There are only a few file types we actually want to process in here for the moment:
+        # anything other than the project handover is ignored (including undetected types)
+        for filename in required_documents_contents:
+            filepath = os.path.join(required_documents_folder, filename)
+            if file_extraction_tools.is_pdf(filepath):
+                report_type = file_extraction_tools.detect_pdf_report_type(pdf_path=filepath)
+                if report_type != "elmhurst project handover":
+                    continue
+                file_extractor = extractors[report_type]
+
+                extracted_contents[report_type] = file_extractor(filepath).extract()
+
+        # Shallow copy is enough: every template value is None
+        output_row_data = output_template.copy()
+
+        # Template keys that nothing below populates yet:
+        # dict_keys([ 'City/County', 'District/Town',
+        # 'Local Authority', 'Trustmark Lodgement ID', 'Certificate Number', 'EWI UMR', 'Loft UMR', 'Windows UMR',
+        # 'Doors UMR', 'Measure Lodgement Date', 'Full Lodgement Date', 'Owner - Name', 'Owner - Phone',
+        # 'Owner - Email', 'Tenant - Name', 'Tenant - Phone',
+        # 'Trustmark Licence Number',
+        # Pre Air Tightness', 'SAP Rating Post (from EPC)', 'Post Heat
+        # Transfer', 'Post Total Floor Area', 'Post Heat Demand', 'Post Air Tightness',
+        # 'Total Cost of Works', 'Annual Fuel Saving (MTP)'])
+
+        # The folder name is split on ")": the text before it (minus the leading "(") is the Osm. ID and
+        # the text after it is the property address
+        update_dictionary_with_check(
+            output_row_data,
+            {
+                "Funding Stream": funding_stream,
+                "Property Address": property_folder.split(")")[1].strip(),
+                "Osm. ID": property_folder.split(")")[0].strip().lstrip("(").strip(),
+            }
+        )
+
+        if extracted_contents.get("elmhurst epr"):
+            total_floor_area = sum(
+                [x["Floor Area (m2)"] for x in extracted_contents["elmhurst epr"]["Building Parts"]] +
+                # Get the conservatory floor area
+                [extracted_contents["elmhurst epr"]["Conservatory"]["Conservatory Floor Area"]]
+            )
+
+            # NOTE(review): "Pre Heat Transfer" is filled from the primary energy use intensity - confirm
+            # this is the intended source field
+            pre_heat_transfer = extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"]
+            pre_heat_demand = (
+                extracted_contents["elmhurst epr"]["Primary Energy Use Intensity (kWh/m2/yr)"] * total_floor_area
+            )
+
+            epr_to_insert = {
+                "Postcode": extracted_contents["elmhurst epr"]["Postcode"],
+                "City/County": extracted_contents["elmhurst epr"]["County"],
+                "District/Town": extracted_contents["elmhurst epr"]["Town"],
+                "Local Authority": None,
+                'SAP Rating Pre (from IMA)': extracted_contents["elmhurst epr"]["Current SAP Rating"],
+                'Pre Heat Transfer': pre_heat_transfer,
+                'Pre Total Floor Area': total_floor_area,
+                'Pre Heat Demand': pre_heat_demand,
+                "R. Assessor - Name": extracted_contents["elmhurst epr"]["Assessor Name"],
+                "Retrofit Assessment Date": extracted_contents["elmhurst epr"]["Assessment Date"],
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                epr_to_insert
+            )
+
+        if extracted_contents.get("full sap xml"):
+            xml_to_insert = {
+                "Property Type": extracted_contents["full sap xml"]["Property Type"],
+                "Property Detachment": extracted_contents["full sap xml"]["Built Form"],
+                "Property age": extracted_contents["full sap xml"]["Age Band"],
+
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                xml_to_insert
+            )
+
+        if extracted_contents.get("osmosis condition report"):
+            cr_to_insert = {
+                "No. of Bedrooms": extracted_contents["osmosis condition report"]["No. of Bedrooms"],
+                # "Risk Path": extracted_contents["osmosis condition report"]["Risk Assessment Pathway"],
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                cr_to_insert
+            )
+
+        # The summary report is applied after the EPR, so when both exist its values win for the shared
+        # columns (including resetting "Pre Heat Demand" to None)
+        if extracted_contents.get("elmhurst summary report"):
+            total_floor_area = sum(
+                [x["Floor Area (m2)"] for x in extracted_contents["elmhurst summary report"]["Building Parts"]] +
+                # Get the conservatory floor area
+                [extracted_contents["elmhurst summary report"]["Conservatory"]["Conservatory Floor Area"]]
+            )
+
+            pre_heat_transfer = (
+                extracted_contents["elmhurst summary report"]["Primary Energy Use Intensity (kWh/m2/yr)"]
+            )
+            pre_heat_demand = None  # Don't have this
+
+            summary_to_insert = {
+                "Postcode": extracted_contents["elmhurst summary report"]["Postcode"],
+                "City/County": extracted_contents["elmhurst summary report"]["County"],
+                "District/Town": extracted_contents["elmhurst summary report"]["Town"],
+                'SAP Rating Pre (from IMA)': extracted_contents["elmhurst summary report"]["Current SAP Rating"],
+                'Pre Heat Transfer': pre_heat_transfer,
+                'Pre Total Floor Area': total_floor_area,
+                'Pre Heat Demand': pre_heat_demand,
+                "R. Assessor - Name": extracted_contents["elmhurst summary report"]["Assessor Name"],
+                "Retrofit Assessment Date": extracted_contents["elmhurst summary report"]["Assessment Date"],
+            }
+
+            update_dictionary_with_check(
+                output_row_data,
+                summary_to_insert
+            )
+
+        if extracted_contents.get("pulse air permeability"):
+            # We extract the AP50 number: the "Extrapolated @ 50PA" value of the "Air Permeability" row
+            results_table = extracted_contents["pulse air permeability"]["Results Table"]
+            ap50 = [x["Extrapolated @ 50PA"] for x in results_table if x["Metric"] == "Air Permeability"][0]
+            update_dictionary_with_check(
+                output_row_data,
+                {"Pre Air Tightness": ap50}
+            )
+
+        if extracted_contents.get("elmhurst project handover"):
+            handover_to_insert = {
+                "Number of Eligible Measures Installed": len(
+                    extracted_contents["elmhurst project handover"]["Measures Fitted"]
+                ),
+                "Retrofit Designer Name": extracted_contents["elmhurst project handover"]["Designer Name"],
+                "Company Name": extracted_contents["elmhurst project handover"]["Installer Name"],
+                "R. Coordinator - Name": extracted_contents["elmhurst project handover"]["Retrofit Coordinator Name"],
+            }
+            update_dictionary_with_check(output_row_data, handover_to_insert)
+
+        # Applied after the Osmosis condition report, so it wins for "No. of Bedrooms" when both exist
+        if extracted_contents.get("core logic pas assessment report"):
+            cr_to_insert = {
+                "No. of Bedrooms": extracted_contents["core logic pas assessment report"]["Number of bedrooms"],
+            }
+            update_dictionary_with_check(
+                output_row_data,
+                cr_to_insert
+            )
+
+        extracted.append(output_row_data)
+
+    # One row per property folder, columns in output_template order
+    extracted_df = pd.DataFrame(extracted)
+
+    extracted_df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Lodgment Pilot/poc-extrcted-data.csv",
+                        index=False)
diff --git a/etl/lodgement/requirements.txt b/etl/lodgement/requirements.txt
new file mode 100644
index 00000000..412aed3b
--- /dev/null
+++ b/etl/lodgement/requirements.txt
@@ -0,0 +1,14 @@
+PyPDF2
+pandas
+tqdm
+openpyxl
+boto3
+usaddress==0.5.11
+fuzzywuzzy==0.18.0
+python-dotenv
+python-docx
+pymupdf
+pytesseract
+pdf2image
+pillow
+pdfplumber
diff --git a/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py
new file mode 100644
index 00000000..3bd87a8c
--- /dev/null
+++ b/etl/route_march/oo_prs_additional_units/oo_prs_additional_units.py
@@ -0,0 +1,240 @@
+import os
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from dotenv import load_dotenv
+from urllib.parse import urlencode
+from epc_api.client import EpcClient
+from utils.logger import setup_logger
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+logger = setup_logger()
+
+# The EPC API token is read from the backend's .env file. The path is relative, so it is resolved against the
+# current working directory of the process.
+load_dotenv(dotenv_path="backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+# Workbook containing the route march addresses. Every entry in CONFIG reads one tab of this file, so the path
+# only needs to be updated here.
+_ROUTE_MARCH_WORKBOOK = (
+    "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/Surveyors Sites for Week Commencing "
+    "11.11.2024.xlsx"
+)
+
+# Tabs of the workbook to process. The names are passed to pandas as-is, so they must match the sheet names
+# exactly (including any trailing whitespace).
+_ROUTE_MARCH_TABS = [
+    "SETTLE GBIS x 242 ",
+    "ACIS GBIS x 76",
+    "SOUTHERN GBIS x 150",
+    "COMMUNITY HOUSING GBIS x 199",
+    "EASTLIGHT GBIS x 42",
+]
+
+# One entry per tab: where to find the file, which tab to read and which column holds the postcodes
+CONFIG = [
+    {
+        "filepath": _ROUTE_MARCH_WORKBOOK,
+        "tab": tab,
+        "postcode_column": "Postcode",
+    }
+    for tab in _ROUTE_MARCH_TABS
+]
+
+# EPC wall descriptions that flag a property as a cavity wall property (see filter_and_prepare_epcs)
+CAVITY_WALL_DESCRIPTIONS = [
+    "Cavity wall, as built, no insulation (assumed)",
+    "Cavity wall, as built, partial insulation (assumed)",
+    "Cavity wall, as built, insulated (assumed)",
+    "Cavity wall, with internal insulation",
+    "Cavity wall, with external insulation",
+]
+
+# EPC roof descriptions that qualify a property for the "Solar and Loft" flag (see filter_and_prepare_epcs)
+ROOF_DESCRIPTIONS = [
+    "Pitched, no insulation",
+    "Pitched, no insulation (assumed)",
+    "Pitched, 25 mm loft insulation",
+    "Pitched, 50 mm loft insulation",
+    "Pitched, 75 mm loft insulation",
+    "Pitched, 100 mm loft insulation",
+    "Pitched, 150 mm loft insulation",
+    "Pitched, limited insulation (assumed)",
+    "Pitched, insulated (assumed)",
+]
+
+# Tenure values that are removed from the output, since only non-social properties are kept
+SOCIAL_TENURES = ["Rented (social)", "rental (social)"]
+
+
+def process_postcode_epcs(postcode, client):
+    """
+    Search the EPC register for a postcode and keep the most recently lodged EPC for each property.
+
+    :param postcode: Postcode to search for. Leading and trailing whitespace is ignored.
+    :param client: EPC API client. Only ``client.domestic.host`` and ``client.domestic.call`` are used.
+    :return: DataFrame with one row per property, or an empty DataFrame when the postcode is missing/blank
+        or the search returns no rows.
+    """
+    # Postcodes read from a spreadsheet can be missing (NaN) or blank, in which case there is nothing to search
+    if not isinstance(postcode, str) or not postcode.strip():
+        logger.warning("Skipping missing or blank postcode %r", postcode)
+        return pd.DataFrame()
+
+    params = {"postcode": postcode.strip()}
+    # The URL is assembled with plain string handling. os.path.join is meant for filesystem paths and would
+    # insert a backslash on Windows.
+    url = client.domestic.host.rstrip("/") + "/search" + "?" + urlencode({"size": 1000})
+    response = client.domestic.call(method="get", url=url, params=params)
+    # Treat an empty list of rows the same as a response without rows: the processing below needs columns
+    if not response or "rows" not in response or not response["rows"]:
+        logger.warning("No EPCs found for postcode %s", postcode)
+        return pd.DataFrame()
+    postcode_epcs = pd.DataFrame(response["rows"])
+
+    # Where the UPRN is missing we fall back to the address, so that the de-duplication below still works per
+    # property. Blank strings are treated as missing too, otherwise every property without a UPRN would share
+    # the same key and all but one of them would be dropped.
+    uprn = postcode_epcs["uprn"]
+    missing_uprn = uprn.isnull() | (uprn.astype(str).str.strip() == "")
+    postcode_epcs["uprn"] = np.where(missing_uprn, postcode_epcs["address"], uprn)
+
+    # Newest lodgement first, so keeping the first row per UPRN keeps the latest EPC
+    postcode_epcs = postcode_epcs.sort_values("lodgement-date", ascending=False)
+    postcode_epcs = postcode_epcs.drop_duplicates("uprn", keep="first")
+    return postcode_epcs
+
+
+def filter_and_prepare_epcs(epcs):
+    """
+    Flag which measures each property is a candidate for and keep only the non-social candidates.
+
+    Two boolean columns are added:
+        - "Is Cavity Property": the wall description is in CAVITY_WALL_DESCRIPTIONS and the SAP score is <= 72
+        - "Solar and Loft": the roof description is in ROOF_DESCRIPTIONS, the EPC records no photovoltaic
+          supply and the SAP score is <= 68
+
+    :param epcs: DataFrame of EPC rows using the EPC API column names. It is not modified.
+    :return: New DataFrame holding the rows that have at least one flag set and whose tenure is not in
+        SOCIAL_TENURES.
+    """
+    # Work on a copy so that the flag columns are not added to the caller's DataFrame
+    epcs = epcs.copy()
+
+    # Parse the SAP score once. Values that cannot be parsed become NaN, which fails both thresholds below, so
+    # a blank score excludes the row rather than crashing the run.
+    sap_score = pd.to_numeric(epcs["current-energy-efficiency"], errors="coerce")
+
+    epcs["Is Cavity Property"] = epcs["walls-description"].isin(CAVITY_WALL_DESCRIPTIONS) & (sap_score <= 72)
+    epcs["Solar and Loft"] = (
+        epcs["roof-description"].isin(ROOF_DESCRIPTIONS)
+        & epcs["photo-supply"].isin(["0", "", "0.0"])
+        & (sap_score <= 68)
+    )
+
+    is_candidate = epcs["Is Cavity Property"] | epcs["Solar and Loft"]
+    is_social = epcs["tenure"].isin(SOCIAL_TENURES)
+    # Return an independent frame, so that callers can add columns to it without chained-assignment warnings
+    return epcs.loc[is_candidate & ~is_social].copy()
+
+
+def rename_and_add_columns(epcs):
+    """
+    Reduce the EPC data to the columns needed for the output, rename them and add the estimated columns.
+
+    The added columns are "Estimated Number of Floors", "Estimated Perimeter (m)",
+    "Estimated Heat Loss Perimeter (m2)" and "Roof Insulation Thickness". An estimate is left empty (None) when
+    one of the values it depends on is missing.
+
+    :param epcs: DataFrame as returned by filter_and_prepare_epcs
+    :return: New DataFrame with the renamed and additional columns
+    :raises KeyError: if one of the required EPC columns is not present
+    """
+    # Retrieve just the data we need
+    required_columns = [
+        "uprn",
+        "address",
+        "postcode",
+        "property-type",
+        "built-form",
+        "inspection-date",
+        "current-energy-rating",
+        "current-energy-efficiency",
+        "roof-description",
+        "walls-description",
+        "transaction-type",
+        # New fields needed
+        "secondheat-description",
+        "total-floor-area",
+        "construction-age-band",
+        "floor-height",
+        "number-habitable-rooms",
+        "mainheat-description",
+        #
+        "energy-consumption-current",  # kwh/m2
+        "tenure",
+        "Is Cavity Property",
+        "Solar and Loft",
+    ]
+    output_names = {
+        "address": "Address",
+        "postcode": "Postcode",
+        "inspection-date": "Date of last EPC",
+        "current-energy-efficiency": "SAP score on register",
+        "current-energy-rating": "EPC rating on register",
+        "property-type": "Property Type",
+        "built-form": "Archetype",
+        "total-floor-area": "Property Floor Area",
+        "construction-age-band": "Property Age Band",
+        "floor-height": "Property Floor Height",
+        "number-habitable-rooms": "Number of Habitable Rooms",
+        "walls-description": "Wall Construction",
+        "roof-description": "Roof Construction",
+        "mainheat-description": "Heating Type",
+        "secondheat-description": "Secondary Heating",
+        "transaction-type": "Reason for last EPC",
+        "energy-consumption-current": "Heat Demand (kWh/m2)",
+        "tenure": "Tenure",
+    }
+    epcs = epcs[required_columns].rename(columns=output_names)
+
+    # Blank or unparseable values become NaN instead of raising, so one incomplete EPC does not stop the run
+    epcs["Number of Habitable Rooms"] = pd.to_numeric(epcs["Number of Habitable Rooms"], errors="coerce")
+    epcs["Property Floor Area"] = pd.to_numeric(epcs["Property Floor Area"], errors="coerce").astype(float)
+
+    # The estimates are built as plain lists rather than with DataFrame.apply(axis=1), because apply returns a
+    # DataFrame instead of a Series when the frame has no rows, which breaks the column assignment.
+    floors = [
+        estimate_number_of_floors(property_type) if pd.notnull(property_type) else None
+        for property_type in epcs["Property Type"]
+    ]
+    epcs["Estimated Number of Floors"] = floors
+
+    perimeters = []
+    for floor_area, habitable_rooms, n_floors in zip(
+        epcs["Property Floor Area"], epcs["Number of Habitable Rooms"], floors
+    ):
+        # Both inputs are divided by the number of floors, so it must be known and non-zero
+        if pd.isnull(floor_area) or pd.isnull(habitable_rooms) or pd.isnull(n_floors) or n_floors == 0:
+            perimeters.append(None)
+            continue
+        perimeters.append(estimate_perimeter(floor_area / n_floors, habitable_rooms / n_floors))
+    epcs["Estimated Perimeter (m)"] = perimeters
+
+    wall_areas = []
+    for n_floors, floor_height, perimeter, archetype in zip(
+        floors, epcs["Property Floor Height"], perimeters, epcs["Archetype"]
+    ):
+        if pd.isnull(n_floors) or pd.isnull(perimeter):
+            wall_areas.append(None)
+            continue
+        # We default to a floor height of 2.4 when it is missing, blank or zero. NaN is truthy, so it has to
+        # be checked for explicitly.
+        height = float(floor_height) if pd.notnull(floor_height) and floor_height else 2.4
+        wall_areas.append(estimate_external_wall_area(n_floors, height, perimeter, archetype))
+    epcs["Estimated Heat Loss Perimeter (m2)"] = wall_areas
+
+    epcs["Roof Insulation Thickness"] = [
+        RoofAttributes(description=roof_description).process()["insulation_thickness"]
+        if pd.notnull(roof_description) else None
+        for roof_description in epcs["Roof Construction"]
+    ]
+    return epcs
+
+
+def main(output_filepath=None):
+    """
+    This application is used to identify additional units that are private rentals or owner occupied that can
+    be included in the route marches
+
+    Required inputs are the following:
+        - An Excel file that contains one or many tabs that include the addresses to be visited (see CONFIG)
+
+    One sheet is written to the output workbook for each tab in CONFIG that produces at least one EPC.
+
+    :param output_filepath: Where to write the output workbook. Defaults to the hard-coded path below.
+    """
+
+    # This should be set:
+    if output_filepath is None:
+        output_filepath = (
+            "/Users/khalimconn-kowlessar/Documents/hestia/Route Marches/PRS and OO properties - WC 11.11.2024.xlsx"
+        )
+    client = EpcClient(auth_token=EPC_AUTH_TOKEN)
+
+    # Using the writer as a context manager closes the workbook even if processing one of the tabs raises
+    with pd.ExcelWriter(output_filepath, engine="xlsxwriter") as writer:
+        for config in CONFIG:
+            logger.info("Processing %s", config["tab"])
+            # Read in the data
+            route_march_addresses = pd.read_excel(
+                config["filepath"],
+                sheet_name=config["tab"],
+                engine="openpyxl"
+            )
+
+            # Empty cells come through as NaN and cannot be searched for. Stripping before de-duplicating
+            # avoids querying the same postcode twice because of stray whitespace.
+            postcodes = route_march_addresses[config["postcode_column"]].dropna().astype(str).str.strip()
+            postcodes = postcodes[postcodes != ""].unique()
+
+            tab_epcs = []
+            for postcode in tqdm(postcodes):
+                postcode_epcs = process_postcode_epcs(postcode, client)
+                if postcode_epcs.empty:
+                    continue
+                tab_epcs.append(postcode_epcs)
+
+            # pd.concat raises on an empty list, so a tab without any EPCs is skipped
+            if not tab_epcs:
+                logger.warning("No EPCs found for any postcode in %s, skipping", config["tab"])
+                continue
+
+            # Concatenate all postcodes' data and filter it
+            epcs = pd.concat(tab_epcs)
+            epcs = filter_and_prepare_epcs(epcs)
+            epcs = rename_and_add_columns(epcs)
+
+            sheet_name = config["tab"][:31]  # Excel sheet names max length of 31 characters
+            epcs.to_excel(writer, sheet_name=sheet_name, index=False)
+
+    logger.info("Data successfully written to %s", output_filepath)
diff --git a/etl/route_march/oo_prs_additional_units/requirements.txt b/etl/route_march/oo_prs_additional_units/requirements.txt
new file mode 100644
index 00000000..e2f4832c
--- /dev/null
+++ b/etl/route_march/oo_prs_additional_units/requirements.txt
@@ -0,0 +1,10 @@
+openpyxl
+epc-api-python==1.0.2
+numpy==2.1.2
+pandas==2.2.3
+usaddress==0.5.11
+fuzzywuzzy==0.18.0
+boto3==1.35.44
+python-dotenv
+tqdm
+xlsxwriter
\ No newline at end of file
diff --git a/etl/xml_survey_extraction/XmlParser.py b/etl/xml_survey_extraction/XmlParser.py
index ffe191a4..ef8daf51 100644
--- a/etl/xml_survey_extraction/XmlParser.py
+++ b/etl/xml_survey_extraction/XmlParser.py
@@ -9,7 +9,8 @@ from etl.xml_survey_extraction.pcdb import heating_data
PROPERTY_TYPE_LOOKUP = {
"0": "House",
"House": "House",
- "2": "Flat"
+ "2": "Flat",
+ "3": "Maisonette",
}
@@ -107,11 +108,13 @@ class XmlParser:
BUILT_FORM_MAP = {
"1": "Detached",
+ "2": "Semi-Detached",
"3": "End-Terrace",
"4": "Mid-Terrace",
}
GLAZED_AREA_MAP = {
+ "2": "More than Typical",
"4": "Much More Than Typical"
}
@@ -120,7 +123,9 @@ class XmlParser:
}
TRANSACTION_TYPE_MAP = {
- "13": "ECO assessment"
+ "5": "Rented (social)",
+ "13": "ECO assessment",
+ "14": "Stock condition survey",
}
TENURE_MAP = {
@@ -131,7 +136,8 @@ class XmlParser:
TARIFF_MAP = {
"1": "Dual",
- "2": "Single"
+ "2": "Single",
+ "3": "Unknown"
}
def __init__(self, file, filekey, surveyor_company, uprn=None):
@@ -400,8 +406,13 @@ class XmlParser:
]
wall_areas = sum([float(f["heat_loss_perimeter"]) * float(f["room_height"]) for f in main_dwelling_floors])
- window_areas = sum([float(w["window_area"]) for w in main_dwelling_windows])
- return wall_areas - window_areas
+ window_areas = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None]
+ if not window_areas:
+ # We discount 10% of the wall area
+ insulation_wall_area = wall_areas * 0.9
+ else:
+ insulation_wall_area = wall_areas - sum(window_areas)
+ return insulation_wall_area
def extract_additional_data(self):
@@ -415,7 +426,8 @@ class XmlParser:
main_dwelling_windows = [w for w in self.windows if w["window_location"] == "0"]
number_of_windows = len(main_dwelling_windows)
- windows_area = sum([float(w["window_area"]) for w in main_dwelling_windows])
+ windows_area = [float(w["window_area"]) for w in main_dwelling_windows if w["window_area"] is not None]
+ windows_area = sum(windows_area) if windows_area else None
boolean_lookup = {
"true": True,
@@ -427,6 +439,7 @@ class XmlParser:
cylinder_insulation_type = {
None: "",
"1": "Foam",
+ "2": "Jacket"
}
cylinder_insulation_thickness = int(
@@ -461,7 +474,7 @@ class XmlParser:
"cylinder_thermostat": cylinder_thermostat,
"main_dwelling_ground_floor_area": float(main_dwelling_ground_floor_area),
"number_of_windows": int(number_of_windows),
- "windows_area": float(windows_area),
+ "windows_area": float(windows_area) if windows_area is not None else windows_area,
}
def get_node_value(self, tag_name):
@@ -769,9 +782,10 @@ class XmlParser:
:return:
"""
- sap_windows = self.xml.getElementsByTagName("SAP-Windows")[0].getElementsByTagName("SAP-Window")
-
glazing_type_lookup = {
+ "ND": "Single glazing",
+ "1": "double glazing installed before 2002",
+ "2": "double glazing installed during or after 2002",
"3": "double glazing, unknown install date",
"5": "Single glazing",
}
@@ -787,6 +801,40 @@ class XmlParser:
"8": "North West"
}
+ sap_windows = self.xml.getElementsByTagName("SAP-Windows")
+
+ if not sap_windows:
+ # We look for Multi-Glazed-Proportion
+ multiple_glazing_type = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+ "Multiple-Glazing-Type"
+ )[0].firstChild.nodeValue
+
+ pvc_frame = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+ "PVC-Window-Frames"
+ )
+
+ pvc_frame = pvc_frame[0].firstChild.nodeValue if pvc_frame else None
+
+ multple_glazed_proportion = self.xml.getElementsByTagName("SAP-Property-Details")[0].getElementsByTagName(
+ "Multiple-Glazed-Proportion"
+ )[0].firstChild.nodeValue
+
+ self.windows = [
+ {
+ "window_location": "0",
+ "window_area": None,
+ "window_type": None,
+ "glazing_type": glazing_type_lookup[multiple_glazing_type],
+ "pvc_frame": pvc_frame,
+ "glazing_gap": None,
+ "orientation": None,
+ "multple_glazed_proportion": multple_glazed_proportion
+ }
+ ]
+ return
+
+ sap_windows = sap_windows[0].getElementsByTagName("SAP-Window")
+
self.windows = [
self._parse_windows_content(
window=window,
diff --git a/input_property_list.csv b/input_property_list.csv
deleted file mode 100644
index dc677c88..00000000
--- a/input_property_list.csv
+++ /dev/null
@@ -1,12 +0,0 @@
-address,postcode,Notes,,,,
-28 Distillery Wharf,W6 9bf,,,,,
-Flat 14 Godley V C House,E2 0LP,,,,,
-49 Elderfield Road,E5 0LF,,,,,
-26 Stanhope Road,N6 5NG,,,,,
-Flat 3 Frederick Building,N1 4BD,,,,,
-Flat 4 Frederick Building,N1 4BD,,,,,
-"Flat 28, 22 Adelina Grove",E1 3BX,,,,,
-"Flat 39, 239 Long Lane",SE1 4PT,,,,,
-"1, Westview, Somerby",LE14 2QH,This property has an unfilled cavity,,,,
-"59, Ashdale",CM23 4EB,This property has a partially filled cavity,,,,
-88 Cleveland Avenue,DL3 7BE,This property has a filled cavity,,,,
\ No newline at end of file
diff --git a/keyzy_pilot.csv b/keyzy_pilot.csv
deleted file mode 100644
index b972bcf9..00000000
--- a/keyzy_pilot.csv
+++ /dev/null
@@ -1,3 +0,0 @@
-address,postcode,Notes,,,,
-2 South Terrace,NN1 5JY,,,,,
-25 Albert Street,PO12 4TY,,,,,
\ No newline at end of file
diff --git a/recommendations/Costs.py b/recommendations/Costs.py
index 5554245f..2d486191 100644
--- a/recommendations/Costs.py
+++ b/recommendations/Costs.py
@@ -37,22 +37,25 @@ MCS_SOLAR_PV_COST_DATA = {
"average_cost_per_kwh-Northern Ireland": 1347,
}
+# Installers are now working with 435 watt panels
+PANEL_SIZE = 0.435
+
INSTALLER_SOLAR_COSTS = [
- {'n_panels': 4, 'array_kwp': 1.6, 'cost': 3040.00, 'installer': 'CEG'},
- {'n_panels': 5, 'array_kwp': 2.1, 'cost': 3201.00, 'installer': 'CEG'},
- {'n_panels': 6, 'array_kwp': 2.5, 'cost': 3363.00, 'installer': 'CEG'},
- {'n_panels': 7, 'array_kwp': 2.9, 'cost': 3524.00, 'installer': 'CEG'},
- {'n_panels': 8, 'array_kwp': 3.3, 'cost': 3686.00, 'installer': 'CEG'},
- {'n_panels': 9, 'array_kwp': 3.7, 'cost': 3847.00, 'installer': 'CEG'},
- {'n_panels': 10, 'array_kwp': 4.1, 'cost': 4009.00, 'installer': 'CEG'},
- {'n_panels': 11, 'array_kwp': 4.5, 'cost': 4170.00, 'installer': 'CEG'},
- {'n_panels': 12, 'array_kwp': 4.9, 'cost': 4332.00, 'installer': 'CEG'},
- {'n_panels': 13, 'array_kwp': 5.3, 'cost': 4835.00, 'installer': 'CEG'},
- {'n_panels': 14, 'array_kwp': 5.7, 'cost': 5015.00, 'installer': 'CEG'},
- {'n_panels': 15, 'array_kwp': 6.2, 'cost': 5176.00, 'installer': 'CEG'},
- {'n_panels': 16, 'array_kwp': 6.6, 'cost': 5338.00, 'installer': 'CEG'},
- {'n_panels': 17, 'array_kwp': 7.0, 'cost': 5500.00, 'installer': 'CEG'},
- {'n_panels': 18, 'array_kwp': 7.4, 'cost': 6021.00, 'installer': 'CEG'}
+ {'n_panels': 4, 'array_kwp': 4 * PANEL_SIZE, 'cost': 4089.25, 'installer': 'CEG'},
+ {'n_panels': 5, 'array_kwp': 5 * PANEL_SIZE, 'cost': 4242.48, 'installer': 'CEG'},
+ {'n_panels': 6, 'array_kwp': 6 * PANEL_SIZE, 'cost': 4395.71, 'installer': 'CEG'},
+ {'n_panels': 7, 'array_kwp': 7 * PANEL_SIZE, 'cost': 4548.94, 'installer': 'CEG'},
+ {'n_panels': 8, 'array_kwp': 8 * PANEL_SIZE, 'cost': 4702.17, 'installer': 'CEG'},
+ {'n_panels': 9, 'array_kwp': 9 * PANEL_SIZE, 'cost': 4855.41, 'installer': 'CEG'},
+ {'n_panels': 10, 'array_kwp': 10 * PANEL_SIZE, 'cost': 5010.95, 'installer': 'CEG'},
+ {'n_panels': 11, 'array_kwp': 11 * PANEL_SIZE, 'cost': 5166.49, 'installer': 'CEG'},
+ {'n_panels': 12, 'array_kwp': 12 * PANEL_SIZE, 'cost': 5322.04, 'installer': 'CEG'},
+ {'n_panels': 13, 'array_kwp': 13 * PANEL_SIZE, 'cost': 5657.6, 'installer': 'CEG'},
+ {'n_panels': 14, 'array_kwp': 14 * PANEL_SIZE, 'cost': 5993.16, 'installer': 'CEG'},
+ {'n_panels': 15, 'array_kwp': 15 * PANEL_SIZE, 'cost': 6328.71, 'installer': 'CEG'},
+ {'n_panels': 16, 'array_kwp': 16 * PANEL_SIZE, 'cost': 6483.33, 'installer': 'CEG'},
+ {'n_panels': 17, 'array_kwp': 17 * PANEL_SIZE, 'cost': 6637.95, 'installer': 'CEG'},
+ {'n_panels': 18, 'array_kwp': 18 * PANEL_SIZE, 'cost': 6792.57, 'installer': 'CEG'}
]
# This is the maximum number of panels that we have a cost from the installers for
INSTALLER_MAX_PANELS = 18
@@ -62,11 +65,11 @@ INSTALLER_MAX_PANELS = 18
INSTALLER_SOLAR_PV_INVERTER_COST = 7500
INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST = 500 # Just a rough guess to labour costs
-INSTALLER_SCAFFOLDING_COSTS = [
- {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'},
- {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'},
- {'stories': 3, 'description': '3 Story Scaffold', 'cost': 1077.00, 'installer': 'CEG'}
-]
+# INSTALLER_SCAFFOLDING_COSTS = [
+# {'stories': 1, 'description': '1 Story Scaffold', 'cost': 531.00, 'installer': 'CEG'},
+# {'stories': 2, 'description': '2 Story Scaffold', 'cost': 841.00, 'installer': 'CEG'},
+# {'stories': 3, 'description': '3 Story Scaffold', 'cost': 1077.00, 'installer': 'CEG'}
+# ]
# This data is based on the MCS database, We use the larger figure between the 2023 and 2024 average,
# to be conservative
@@ -101,10 +104,10 @@ INSTALLER_ASHP_COSTS = [
BOILER_UPGRADE_SCHEME_ASHP_VALUE = 7500
INSTALLER_SOLAR_BATTERY_COSTS = [
- {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 2700.00, 'installer': 'CEG'},
- {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'},
- {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'},
- {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'}
+ {'capacity_kwh': 5, 'description': 'Battery Add on', 'cost': 3769.89, 'installer': 'JJC'},
+ # {'capacity_kwh': 10, 'description': 'Battery Add on', 'cost': 4300.00, 'installer': 'CEG'},
+ # {'capacity_kwh': 5, 'description': 'Battery Retrofit existing system', 'cost': 4250.00, 'installer': 'CEG'},
+ # {'capacity_kwh': 10, 'description': 'Battery Retrofit Existing system', 'cost': 5950.00, 'installer': 'CEG'}
]
# This is based on https://www.checkatrade.com/blog/cost-guides/cost-smart-thermostat/
@@ -149,7 +152,7 @@ CONDENSING_BOILER_COSTS = {
ELECTRIC_BOILER_COSTS = 1800
# Assumes 1 hours to remove each heater (including re-decorating)
-ROOM_HEATER_REMOVAL_COST = 50
+ROOM_HEATER_REMOVAL_COST = 25
ROOM_HEATER_REMOVAL_LABOUR_HOURS = 3
# This is a cost quoted by Jim for a system flush - existig system will run more efficiently
@@ -190,6 +193,8 @@ class Costs:
# fittings and trimming doors, as well as scope for damage to the existing wall during preparation.
IWI_CONTINGENCY = 0.2
+ # For air source heat pumps, we inflate the assumed cost by quite a bit to account for design and installation
+ ASHP_CONTINGENCY = 0.35
# Where there is more uncertainty, a higher contingency rate is used
HIGH_RISK_CONTINGENCY = 0.2
# When there is less uncertainty, a lower contingency rate is used
@@ -234,6 +239,13 @@ class Costs:
if self.region is None:
# Try and grab using the local-authority-label
self.region = county_to_region_map.get(self.property.data["local-authority-label"], None)
+
+ if self.region is None:
+ # Try and get the region after converting the keys to lower
+ self.region = {
+ k.lower(): v for k, v in county_to_region_map.items()
+ }.get(self.property.data["local-authority-label"].lower(), None)
+
if self.region is None:
raise ValueError("Region not found in county map")
@@ -719,8 +731,9 @@ class Costs:
"labour_days": labour_days
}
+ @classmethod
def solar_pv(
- self,
+ cls,
n_panels: int | float,
has_battery: bool = False,
array_cost=None,
@@ -758,33 +771,28 @@ class Costs:
else:
system_cost = [c for c in INSTALLER_SOLAR_COSTS if c["n_panels"] == n_panels][0]["cost"]
- total_cost = array_cost if array_cost is not None else system_cost
+ subtotal = array_cost if array_cost is not None else system_cost
if has_battery:
battery_cost = [c for c in INSTALLER_SOLAR_BATTERY_COSTS if c["capacity_kwh"] == battery_kwh][0]["cost"]
- total_cost += battery_cost
-
- scaffolding_cost = [c for c in INSTALLER_SCAFFOLDING_COSTS if c["stories"] == n_floors][0]["cost"]
- total_cost += scaffolding_cost
+ subtotal += battery_cost
if needs_inverter:
- total_cost += INSTALLER_SOLAR_PV_INVERTER_COST
+ subtotal += INSTALLER_SOLAR_PV_INVERTER_COST
# We also add an additional labour cost
- total_cost += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST
+ subtotal += INSTALLER_SOLAR_PV_INVERTER_LABOUR_COST
- # We add an additional cost for scaffolding
-
- subtotal_before_vat = total_cost / (1 + self.VAT_RATE)
-
- vat = total_cost - subtotal_before_vat
+ # Solar doesn't have VAT but we add a high risk contingency
+ # to account for design variation that we see in practice
+ total_cost = subtotal * (1 + cls.HIGH_RISK_CONTINGENCY)
# Labour hours are based on estimates from online research but an average team seems to consist of 3 people
# and most jobs take around 2 days. Assuming an 8 hour day for 3 people across 2 days, gives us 48 hours of
# labour
return {
"total": total_cost,
- "subtotal": subtotal_before_vat,
- "vat": vat,
+ "subtotal": subtotal,
+ "vat": 0,
"labour_hours": 48,
"labour_days": 2,
}
@@ -1154,7 +1162,6 @@ class Costs:
pump. This cost will include the boiler upgrade scheme grant
"""
-
# This is the average cost of a project, we'll add some additional contingency
if ashp_size is None:
@@ -1163,9 +1170,10 @@ class Costs:
cost = [x for x in INSTALLER_ASHP_COSTS if x][0]["cost"]
# We add some contingency since there are additional costs such as resizing radiators, that could be required
- total_cost = cost * (1 + self.CONTINGENCY)
- subtotal_before_vat = total_cost / (1 + self.VAT_RATE)
- vat = total_cost - subtotal_before_vat
+ subtotal = cost * (1 + self.ASHP_CONTINGENCY)
+ # The costs from installers exclude VAT
+ vat = subtotal * self.VAT_RATE
+ total_cost = subtotal + vat
# We assume 5 days installation
labour_days = 5
@@ -1173,7 +1181,7 @@ class Costs:
return {
"total": total_cost,
- "subtotal": subtotal_before_vat,
+ "subtotal": subtotal,
"vat": vat,
"labour_hours": labour_hours,
"labour_days": labour_days,
diff --git a/recommendations/DraughtProofingRecommendations.py b/recommendations/DraughtProofingRecommendations.py
index 4bd85a03..a16a94f6 100644
--- a/recommendations/DraughtProofingRecommendations.py
+++ b/recommendations/DraughtProofingRecommendations.py
@@ -26,6 +26,9 @@ class DraughtProofingRecommendations:
if not draught_proofing_recommendation_config:
return
+ # Cost is based on a £50 cost per window, based on Checkatrade
+ cost = draught_proofing_recommendation_config.get("cost", self.property.number_of_windows * 50)
+
description = (
"Draught proof doors and windows to improve energy efficiency" if
not draught_proofing_recommendation_config.get("description")
@@ -48,7 +51,7 @@ class DraughtProofingRecommendations:
"kwh_savings": 0,
"co2_equivalent_savings": 0,
"energy_cost_savings": 0,
- "total": draught_proofing_recommendation_config["cost"],
+ "total": cost,
# We use a very simple and rough estimate of 4 hours per unit
"labour_hours": draught_proofing_recommendation_config.get("labour_hours", 8),
"labour_days": draught_proofing_recommendation_config.get("labour_days", 1), # Assume 8 hour day
diff --git a/recommendations/FloorRecommendations.py b/recommendations/FloorRecommendations.py
index 25741e7a..85e1a8dc 100644
--- a/recommendations/FloorRecommendations.py
+++ b/recommendations/FloorRecommendations.py
@@ -145,7 +145,9 @@ class FloorRecommendations(Definitions):
)
return
- raise NotImplementedError("Implement me!")
+ # In this case, we have no recommendation to make. E.g., if we have a solid floor property
+ # but solid floor insulation has been excluded as a measure, we get here
+ return
@staticmethod
def _make_floor_description(material):
@@ -172,6 +174,11 @@ class FloorRecommendations(Definitions):
insulation_materials = pd.DataFrame(insulation_materials)
+ non_invasive_recs = next(
+ (r for r in self.property.non_invasive_recommendations if
+ r["type"] == insulation_materials["type"].values[0]), {}
+ )
+
lowest_selected_u_value = None
for _, insulation_material_group in insulation_materials.groupby("description"):
@@ -217,6 +224,9 @@ class FloorRecommendations(Definitions):
else:
raise NotImplementedError("Implement me!")
+ sap_points = non_invasive_recs.get("sap_points", None)
+ survey = non_invasive_recs.get("survey", False)
+
floor_ending_config = FloorAttributes(new_description).process()
floor_simulation_config = check_simulation_difference(
new_config=floor_ending_config, old_config=self.property.floor, prefix="floor_"
@@ -245,7 +255,8 @@ class FloorRecommendations(Definitions):
"description": self._make_floor_description(material),
"starting_u_value": u_value,
"new_u_value": new_u_value,
- "sap_points": None,
+ "sap_points": sap_points,
+ "survey": survey,
"already_installed": already_installed,
"simulation_config": simulation_config,
"description_simulation": {
diff --git a/recommendations/HeatingControlRecommender.py b/recommendations/HeatingControlRecommender.py
index c613aa42..bd015a79 100644
--- a/recommendations/HeatingControlRecommender.py
+++ b/recommendations/HeatingControlRecommender.py
@@ -12,7 +12,7 @@ class HeatingControlRecommender:
self.recommendation = []
- def recommend(self, heating_description, description_prefix="", description_suffix=""):
+ def recommend(self, heating_description, phase, description_prefix="", description_suffix=""):
# TODO: Many of these functions are quite similar. We can possibly create a single wrapper function that
# takes in the heating description and the description prefix/suffix, and then creates the appropriate
@@ -23,32 +23,32 @@ class HeatingControlRecommender:
# This first iteration of the recommender will provide very basic recommendation
# We recommend heating controls based on the main heating system
if heating_description in ["Room heaters, electric"]:
- self.recommend_room_heaters_electric_controls()
+ self.recommend_room_heaters_electric_controls(phase=phase)
return
if heating_description in ["Electric storage heaters", "Electric storage heaters, radiators"]:
- self.recommend_high_heat_retention_controls(description_prefix=description_prefix)
+ self.recommend_high_heat_retention_controls(description_prefix=description_prefix, phase=phase)
return
if heating_description in ["Boiler and radiators, mains gas"]:
# We can recommend roomstat programmer trvs
- self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix)
+ self.recommend_roomstat_programmer_trvs(description_suffix=description_suffix, phase=phase)
# We can also recommend time and temperature zone controls
- self.recommend_time_temperature_zone_controls(description_suffix=description_suffix)
+ self.recommend_time_temperature_zone_controls(description_suffix=description_suffix, phase=phase)
return
if heating_description in ["Boiler and radiators, electric"]:
- self.recommend_roomstat_programmer_trvs()
+ self.recommend_roomstat_programmer_trvs(phase=phase)
return
if heating_description in ["Air source heat pump, radiators, electric"]:
# For an ASHP, we can recommend time and temperature zone controls, as well as programmer, trvs and a bypass
# which are common configurations for ASHPs
- self.recommend_time_temperature_zone_controls()
+ self.recommend_time_temperature_zone_controls(phase=phase)
# self.recommend_programmer_trvs_bypass()
- def recommend_room_heaters_electric_controls(self):
+ def recommend_room_heaters_electric_controls(self, phase):
"""
If the home has Room heaters, electric, we start by identifying potential heating controls that could
be upgraded, that would provide a practical impact. This will be the least invasive improvement.
@@ -88,6 +88,9 @@ class HeatingControlRecommender:
self.recommendation.append(
{
+ "phase": phase,
+ "type": "heating",
+ "measure_type": "programmer_appliance_thermostat",
"description": "upgrade heating controls to Programmer and Appliance or Smart Thermostats",
**self.costs.programmer_and_appliance_thermostat(has_programmer=has_programmer),
"simulation_config": simulation_config
@@ -97,7 +100,7 @@ class HeatingControlRecommender:
# We don't implement any other recommendations right now
return
- def recommend_high_heat_retention_controls(self, description_prefix=""):
+ def recommend_high_heat_retention_controls(self, phase, description_prefix=""):
"""
When applicable, we recommend upgrading the heating controls to high heat retention controls. This is a
specific type of control system that is designed to work with electric storage heaters. It is a more
@@ -133,6 +136,9 @@ class HeatingControlRecommender:
self.recommendation.append(
{
+ "phase": phase,
+ "type": "heating",
+ "measure_type": "celect_type_controls",
"description": "Upgrade heating controls to High Heat Retention Storage Heater Controls",
**self.costs.celect_type_controls(),
"simulation_config": simulation_config,
@@ -143,7 +149,7 @@ class HeatingControlRecommender:
# We don't implement any other recommendations right now
return
- def recommend_roomstat_programmer_trvs(self, description_suffix=""):
+ def recommend_roomstat_programmer_trvs(self, phase, description_suffix=""):
"""
If the home has a boiler and radiators, mains gas, we start by identifying potential heating controls that could
be upgraded, that would provide a practical impact.
@@ -208,15 +214,16 @@ class HeatingControlRecommender:
description = "Upgrade heating controls to Room thermostat, programmer and TRVs"
- already_installed = "heating_control" in self.property.already_installed
+ already_installed = "roomstat_programmer_trvs" in self.property.already_installed
if already_installed:
cost_result = override_costs(cost_result)
description = "Heating controls have already been upgraded, no further action needed."
self.recommendation.append(
{
- "type": "heating_control",
+ "type": "heating",
"measure_type": "roomstat_programmer_trvs",
+ "phase": phase,
"parts": [],
"description": description,
**cost_result,
@@ -231,7 +238,7 @@ class HeatingControlRecommender:
return
- def recommend_time_temperature_zone_controls(self, description_suffix=""):
+ def recommend_time_temperature_zone_controls(self, phase, description_suffix=""):
"""
If the home has a boiler, we can recommend time and temperature zone controls. This is a more advanced
and more efficient control system than the standard controls that come with a boiler. However, it may come
@@ -282,14 +289,15 @@ class HeatingControlRecommender:
"temperature zone control)"
)
- already_installed = "heating_control" in self.property.already_installed
+ already_installed = "time_temperature_zone_control" in self.property.already_installed
if already_installed:
cost_result = override_costs(cost_result)
description = "Heating controls have already been upgraded, no further action needed."
self.recommendation.append(
{
- "type": "heating_control",
+ "type": "heating",
+ "phase": phase,
"measure_type": "time_temperature_zone_control",
"parts": [],
"description": description,
@@ -335,14 +343,15 @@ class HeatingControlRecommender:
description = "Install a Bypass valve, TRVs and a Programmer"
- already_installed = "heating_control" in self.property.already_installed
+ already_installed = "programmer_trvs_bypass" in self.property.already_installed
if already_installed:
cost_result = override_costs(cost_result)
description = "Heating controls have already been upgraded, no further action needed."
self.recommendation.append(
{
- "type": "heating_control",
+ "type": "heating",
+ "measure_type": "programmer_trvs_bypass",
"parts": [],
"description": description,
**cost_result,
diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py
index 7dc4f8b2..20f5e7ad 100644
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -65,7 +65,6 @@ class HeatingRecommender:
self.costs = Costs(self.property)
self.heating_recommendations = []
- self.heating_control_recommendations = []
self.has_electric_heating_description = (
self.property.main_heating["has_electric"] or self.property.main_heating["has_electricaire"]
@@ -259,7 +258,6 @@ class HeatingRecommender:
"ashp_only_heating_recommendation", False
)
self.heating_recommendations = []
- self.heating_control_recommendations = []
# This first iteration of the recommender will provide very basic recommendation
# We recommend heating controls based on the main heating system
@@ -302,7 +300,6 @@ class HeatingRecommender:
self.recommend_air_source_heat_pump(
phase=phase,
has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations,
-
)
return
@@ -360,7 +357,7 @@ class HeatingRecommender:
}
controls_recommender = HeatingControlRecommender(self.property)
- controls_recommender.recommend(heating_description="Boiler and radiators, electric")
+ controls_recommender.recommend(heating_description="Boiler and radiators, electric", phase=phase)
self.heating_recommendations.extend([boiler_recommendation] + controls_recommender.recommendation)
return
@@ -453,7 +450,7 @@ class HeatingRecommender:
), {})
controls_recommender = HeatingControlRecommender(self.property)
- controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric")
+ controls_recommender.recommend(heating_description="Air source heat pump, radiators, electric", phase=phase)
ashp_size = self.size_heat_pump()
ashp_costs = self.costs.air_source_heat_pump(ashp_size)
@@ -631,7 +628,8 @@ class HeatingRecommender:
heating_controls_only,
system_change,
system_type,
- measure_type
+ measure_type,
+ non_intrusive_recommendation=None
):
"""
Given a recommendation for heating controls, and a recommendation for the heating system, we combine the two
@@ -649,8 +647,13 @@ class HeatingRecommender:
:param system_type: The type of heating system we are recommending
:param measure_type: The type of measure we are recommending - more granular than the "type" field, allowing us
to distinguish between different types of heating recommendations
+ :param non_intrusive_recommendation: A non-intrusive recommendation, which may specify the number of SAP points
+ or a cost for this recommendation
"""
+ if non_intrusive_recommendation is None:
+ non_intrusive_recommendation = {}
+
# We produce recommendations with & without heating controls
# We will also produce a recommendation for heating controls only
heating_controls_switch = [True, False] if controls_recommendations else [False]
@@ -698,13 +701,14 @@ class HeatingRecommender:
"description": recommendation_description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": non_intrusive_recommendation.get("sap_points"),
"already_installed": already_installed,
**total_costs,
"simulation_config": recommendation_simulation_config,
"description_simulation": recommendation_description_simulation,
# We insert the heating system type here
- "system_type": system_type
+ "system_type": system_type,
+ "survey": non_intrusive_recommendation.get("survey", False)
}
output.append(recommendation)
@@ -798,7 +802,9 @@ class HeatingRecommender:
description_prefix = ""
controls_recommender.recommend(
- heating_description="Electric storage heaters", description_prefix=description_prefix
+ heating_description="Electric storage heaters",
+ description_prefix=description_prefix,
+ phase=phase
)
has_hhr = self.is_hhr_already_installed()
@@ -807,6 +813,13 @@ class HeatingRecommender:
# No recommendation needed
return
+ # We check if there is a high heat retention non-intrusive recommendation
+ non_intrusive_recommendation = next(
+ (r for r in self.property.non_invasive_recommendations if
+ r["type"] == "high_heat_retention_storage_heater"),
+ {}
+ )
+
# We check if the property has dual heating in place with a boiler and storage heaters
if self.dual_heating:
new_heating_description = self.DUAL_HEATING_DESCRIPTIONS[
@@ -838,6 +851,8 @@ class HeatingRecommender:
else:
heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"]
+ # TODO: We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual
+ # immersion; we'll keep this for the moment though
if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]:
heating_simulation_config["hot_water_energy_eff_ending"] = "Average"
else:
@@ -895,7 +910,8 @@ class HeatingRecommender:
heating_controls_only=heating_controls_only,
system_change=system_change,
system_type="high_heat_retention_storage_heater",
- measure_type="high_heat_retention_storage_heater"
+ measure_type="high_heat_retention_storage_heater",
+ non_intrusive_recommendation=non_intrusive_recommendation
)
if _return:
return recommendations
@@ -978,9 +994,13 @@ class HeatingRecommender:
# We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler
has_inefficient_water = (
self.property.data["mains-gas-flag"] and
- self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"]
+ self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]
)
+ non_invasive_recommendation = next((
+ r for r in self.property.non_invasive_recommendations if r["type"] == "boiler_upgrade"
+ ), {})
+
if has_inefficient_space_heating or has_inefficient_water:
boiler_size = self.estimate_boiler_size(
property_type=self.property.data["property-type"],
@@ -1079,12 +1099,13 @@ class HeatingRecommender:
"description": description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": non_invasive_recommendation.get("sap_points", None),
"already_installed": already_installed,
"simulation_config": simulation_config,
"description_simulation": description_simulation,
**boiler_costs,
"system_type": "boiler_upgrade",
+ "survey": non_invasive_recommendation.get("survey", None)
}
# We recommend the heating controls
@@ -1098,10 +1119,10 @@ class HeatingRecommender:
description_suffix = ""
controls_recommender.recommend(
heating_description="Boiler and radiators, mains gas",
- description_suffix=description_suffix
+ description_suffix=description_suffix,
+ phase=recommendation_phase
)
# We may have 2 recommendations from the heating controls
-
if not controls_recommender.recommendation and not boiler_recommendation:
return
@@ -1111,6 +1132,8 @@ class HeatingRecommender:
if system_change:
# We combine the heating and controls recommendations, in the case of a system change
+ # If this is true, we set SAP points to None and survey to False for the boiler recommendation
+
combined_recommendations = []
for controls_recommendation in controls_recommender.recommendation:
combined_recommendation = self.combine_heating_and_controls(
@@ -1137,10 +1160,6 @@ class HeatingRecommender:
# 3) Heating controls only
# But they are options that are not mutually exclusive
# So, we actually set heating controls as a heating recommendation
- for recommendation in controls_recommender.recommendation:
- recommendation["phase"] = recommendation_phase
- # recommendation["type"] = "heating"
-
- self.heating_control_recommendations.extend(controls_recommender.recommendation)
+ self.heating_recommendations.extend(controls_recommender.recommendation)
return
diff --git a/recommendations/HotwaterRecommendations.py b/recommendations/HotwaterRecommendations.py
index 636a7be0..d8404cc1 100644
--- a/recommendations/HotwaterRecommendations.py
+++ b/recommendations/HotwaterRecommendations.py
@@ -20,26 +20,66 @@ class HotwaterRecommendations:
:return:
"""
# Reset the recommendations
+ recommendations_phase = phase
+
self.recommendations = []
+ non_invasive_recommendations = self.property.non_invasive_recommendations
+ if non_invasive_recommendations:
+ measures = [
+ r["type"] for r in non_invasive_recommendations if
+ r["type"] in ["hot_water_tank_insulation", "cylinder_thermostat"]
+ ]
+
+ for m in measures:
+ non_invasive_rec = [
+ r for r in non_invasive_recommendations if r["type"] == m
+ ][0]
+ if m == "hot_water_tank_insulation":
+ # We need to be able to stack these recommendations
+ self.recommend_tank_insulation(
+ phase=recommendations_phase,
+ sap_points=non_invasive_rec["sap_points"],
+ survey=non_invasive_rec["survey"],
+ )
+
+ recommendations_phase += 1
+ elif m == "cylinder_thermostat":
+ self.recommend_cylinder_thermostat(
+ phase=recommendations_phase,
+ sap_points=non_invasive_rec["sap_points"],
+ survey=non_invasive_rec["survey"],
+ )
+ recommendations_phase += 1
# This first iteration of the recommender will provide very basic recommendation
# We recommend heating controls based on the main heating system
- # If there is no system present, but access to the mains, we
+ if self.property.hotwater["clean_description"] == "Gas boiler/circulator, no cylinder thermostat":
+ # Handle this case specifically:
+ self.recommend_cylinder_thermostat_gas_boiler_circulator(phase=recommendations_phase)
+ return
+
+ # If there is no system present, but access to the mains, we
+
+ has_tank_recommendation = [r for r in self.recommendations if r["type"] == "hot_water_tank_insulation"]
if (
(self.property.hotwater["heater_type"] in ["electric immersion"]) &
(self.property.data["hot-water-energy-eff"] == "Very Poor") &
- (self.property.hotwater["no_system_present"] is None)
+ (self.property.hotwater["no_system_present"] is None) &
+ (len(has_tank_recommendation) == 0)
):
- self.recommend_tank_insulation(phase=phase)
+ self.recommend_tank_insulation(phase=recommendations_phase)
return
- if self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat":
- self.recommend_cylinder_thermostat(phase=phase)
+ has_cylinder_recommendation = [r for r in self.recommendations if r["type"] == "cylinder_thermostat"]
+
+ if ((self.property.hotwater["clean_description"] == "From main system, no cylinder thermostat") &
+ (len(has_cylinder_recommendation) == 0)):
+ self.recommend_cylinder_thermostat(phase=recommendations_phase)
return
- def recommend_tank_insulation(self, phase):
+ def recommend_tank_insulation(self, phase, sap_points=None, survey=False, _return=False):
"""
If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water
tank. This is a very simple and cost effective improvement that can be made to the home. It will likely
@@ -55,27 +95,30 @@ class HotwaterRecommendations:
else:
description = "Insulate hot water tank"
- self.recommendations.append(
- {
- "phase": phase,
- "parts": [],
- "type": "hot_water_tank_insulation",
- "measure_type": "hot_water_tank_insulation",
- "description": description,
- "starting_u_value": None,
- "new_u_value": None,
- "sap_points": None,
- "already_installed": already_installed,
- **recommendation_cost,
- "simulation_config": {"hot_water_energy_eff_ending": "Poor"},
- "description_simulation": {
- "hot-water-energy-eff": "Poor"
- }
- }
- )
+ to_append = {
+ "phase": phase,
+ "parts": [],
+ "type": "hot_water_tank_insulation",
+ "measure_type": "hot_water_tank_insulation",
+ "description": description,
+ "starting_u_value": None,
+ "new_u_value": None,
+ "sap_points": sap_points,
+ "already_installed": already_installed,
+ **recommendation_cost,
+ "simulation_config": {"hot_water_energy_eff_ending": "Poor"},
+ "description_simulation": {
+ "hot-water-energy-eff": "Poor"
+ },
+ "survey": survey
+ }
+ if _return:
+ return to_append
+
+ self.recommendations.append(to_append)
return
- def recommend_cylinder_thermostat(self, phase):
+ def recommend_cylinder_thermostat(self, phase, sap_points=None, survey=False, _return=False):
"""
If the home has a very poor hot water system, this is often indicative of a lack of insulation on the hot water
tank. This is a very simple and cost effective improvement that can be made to the home.
@@ -101,23 +144,86 @@ class HotwaterRecommendations:
**hotwater_simulation_config
}
- self.recommendations.append(
- {
- "phase": phase,
- "parts": [],
- "type": "cylinder_thermostat",
- "measure_type": "cylinder_thermostat",
- "description": description,
- "starting_u_value": None,
- "new_u_value": None,
- "sap_points": None,
- "already_installed": already_installed,
- **recommendation_cost,
- "simulation_config": simulation_config,
- "description_simulation": {
- "hot-water-energy-eff": self.property.data["hot-water-energy-eff"],
- "hotwater-description": new_epc_description,
- }
- }
- )
+ to_append = {
+ "phase": phase,
+ "parts": [],
+ "type": "cylinder_thermostat",
+ "measure_type": "cylinder_thermostat",
+ "description": description,
+ "starting_u_value": None,
+ "new_u_value": None,
+ "sap_points": sap_points,
+ "already_installed": already_installed,
+ **recommendation_cost,
+ "simulation_config": simulation_config,
+ "description_simulation": {
+ "hot-water-energy-eff": self.property.data["hot-water-energy-eff"],
+ "hotwater-description": new_epc_description,
+ },
+ "survey": survey
+ }
+ if _return:
+ return to_append
+
+ self.recommendations.append(to_append)
+ return
+
+ def recommend_cylinder_thermostat_gas_boiler_circulator(self, phase):
+ """
+ Recommend a combined cylinder thermostat and tank insulation measure for homes whose hot water is
+ described as "Gas boiler/circulator, no cylinder thermostat". The costs of both measures are summed and
+ a hot water efficiency of "Average" or worse is simulated as "Good".
+ """
+
+ thermostat_recommendation_cost = self.costs.cylinder_thermostat()
+ cylinder_recommendation_cost = self.costs.hot_water_tank_insulation()
+ # Add them
+ total_cost = {
+ k: thermostat_recommendation_cost[k] + cylinder_recommendation_cost[k] for k in
+ thermostat_recommendation_cost.keys()
+ }
+
+ already_installed = "cylinder_thermostat" in self.property.already_installed
+ if already_installed:
+ total_cost = override_costs(total_cost)
+ description = "Cylinder thermostat & insulation has already been installed, no further action required"
+ else:
+ description = "Install a smart cylinder thermostat and insulate the hot water tank with 80mm insulation"
+
+ new_epc_description = "From main system"
+ hotwater_ending_config = HotWaterAttributes(new_epc_description).process()
+ hotwater_simulation_config = check_simulation_difference(
+ new_config=hotwater_ending_config, old_config=self.property.hotwater
+ )
+
+ if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"]:
+ new_efficiency = "Good"
+ else:
+ new_efficiency = self.property.data["hot-water-energy-eff"]
+
+ simulation_config = {
+ "hot_water_energy_eff_ending": new_efficiency,
+ **hotwater_simulation_config
+ }
+
+ to_append = {
+ "phase": phase,
+ "parts": [],
+ "type": "cylinder_thermostat",
+ "measure_type": "cylinder_thermostat",
+ "description": description,
+ "starting_u_value": None,
+ "new_u_value": None,
+ "sap_points": None,
+ "already_installed": already_installed,
+ **total_cost,
+ "simulation_config": simulation_config,
+ "description_simulation": {
+ "hot-water-energy-eff": simulation_config["hot_water_energy_eff_ending"],
+ "hotwater-description": new_epc_description,
+ },
+ "survey": False
+ }
+
+ self.recommendations.append(to_append)
return
diff --git a/recommendations/LightingRecommendations.py b/recommendations/LightingRecommendations.py
index f9a1d63a..3447394d 100644
--- a/recommendations/LightingRecommendations.py
+++ b/recommendations/LightingRecommendations.py
@@ -4,6 +4,7 @@ from backend.Property import Property
from typing import List
from recommendations.Costs import Costs
from recommendations.recommendation_utils import override_costs
+from backend.ml_models.AnnualBillSavings import AnnualBillSavings
class LightingRecommendations:
@@ -161,6 +162,7 @@ class LightingRecommendations:
# the proportion of lights that will be set to low energy
"sap_points": sap_points,
"kwh_savings": heat_demand_change,
+ "energy_cost_savings": heat_demand_change * AnnualBillSavings.ELECTRICITY_PRICE_CAP,
"co2_equivalent_savings": carbon_change,
"description_simulation": {
"lighting-energy-eff": "Very Good",
diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py
index dd51b47d..0e73cffe 100644
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@@ -142,19 +142,17 @@ class Recommendations:
# Ventilation recommendations
# We only produce a ventilation recommendation if the property is recommended to have wall or roof
- # insulation
- # We will not attribute a SAP impact to the ventilation recommendation, since we've seen that this
- # has no
- # real impact on the SAP score. Therefore, we don't need to include phasing for ventilation. If we
- # have any
- # wall or roof recommendations, we will ensure that ventilation is included in the simulation
+ # insulation. Mechanical ventilation can have a (negative) SAP impact, so it is given its own phase and
+ # its impact is scored. If we have any wall or roof recommendations, we will ensure that ventilation
+ # is included in the simulation
if (
(self.wall_recomender.recommendations or self.roof_recommender.recommendations) and
("ventilation" in measures)
):
- self.ventilation_recomender.recommend()
+ self.ventilation_recomender.recommend(phase=phase)
if self.ventilation_recomender.recommendation:
property_recommendations.append(self.ventilation_recomender.recommendation)
+ phase += 1
if "trickle_vents" in measures:
# This is a recommendatin that typically comes from an energy assessment
@@ -211,27 +209,25 @@ class Recommendations:
measures=measures,
has_cavity_or_loft_recommendations=has_cavity_or_loft_recommendations,
)
- if (
- self.heating_recommender.heating_recommendations or
- self.heating_recommender.heating_control_recommendations
- ):
+ if self.heating_recommender.heating_recommendations:
# We split into first and second phase recommendations
first_phase_recommendations = [
r for r in (
- self.heating_recommender.heating_recommendations +
- self.heating_recommender.heating_control_recommendations
+ self.heating_recommender.heating_recommendations
)
if r["phase"] == phase
]
second_phase_recommendations = [
r for r in (
- self.heating_recommender.heating_recommendations +
- self.heating_recommender.heating_control_recommendations
+ self.heating_recommender.heating_recommendations
)
if r["phase"] == phase + 1
]
+ if first_phase_recommendations and second_phase_recommendations:
+ raise Exception("Imeplement me")
+
if first_phase_recommendations:
property_recommendations.append(first_phase_recommendations)
@@ -243,8 +239,7 @@ class Recommendations:
# otherwise we incremenet by 1
max_used_phase = max(
[rec["phase"] for rec in
- self.heating_recommender.heating_recommendations +
- self.heating_recommender.heating_control_recommendations]
+ self.heating_recommender.heating_recommendations]
)
amount_to_increment = max_used_phase - phase + 1
phase += amount_to_increment
@@ -253,8 +248,13 @@ class Recommendations:
if "hot_water" in measures:
self.hotwater_recommender.recommend(phase=phase)
if self.hotwater_recommender.recommendations:
- property_recommendations.append(self.hotwater_recommender.recommendations)
- phase += 1
+ if len(self.hotwater_recommender.recommendations) > 1:
+ for r in self.hotwater_recommender.recommendations:
+ property_recommendations.append([r])
+ phase += 1
+ else:
+ property_recommendations.append(self.hotwater_recommender.recommendations)
+ phase += 1
if "secondary_heating" in measures:
self.secondary_heating_recommender.recommend(phase=phase)
@@ -304,12 +304,12 @@ class Recommendations:
# want to include the cavity wall insulation recommendation in the defaults
if recommendations_by_type[0].get("type") in [
- "mechanical_ventilation", "trickle_vents", "draught_proofing"
+ "trickle_vents", "draught_proofing"
]:
continue
has_u_value = recommendations_by_type[0].get("new_u_value") is not None
- has_sap_points = recommendations_by_type[0].get("sap_points") is not None
+ has_sap_points = all([r.get("sap_points") is not None for r in recommendations_by_type])
has_rank = recommendations_by_type[0].get("rank") is not None
# When check if these recommendations have two different types, such as solid wall insulation
@@ -447,6 +447,7 @@ class Recommendations:
property_instance,
all_predictions,
recommendations,
+ representative_recommendations,
):
"""
@@ -460,6 +461,7 @@ class Recommendations:
:param property_instance: Instance of the Property class, for the home associated to property_id
:param all_predictions: dictionary of predictions from the model apis
:param recommendations: dictionary of recommendations for the property
+ :param representative_recommendations: dictionary of representative recommendations for the property
:return:
"""
@@ -471,15 +473,20 @@ class Recommendations:
property_recommendations = recommendations[property_instance.id].copy()
+ representative_recs = representative_recommendations[property_instance.id].copy()
+ representative_ids = [r["recommendation_id"] for r in representative_recs]
+
increasing_variables = ["sap"]
decreasing_variables = ["carbon", "heat_demand"]
+ # For mechanical ventilation the expected directions are reversed: SAP may fall, carbon/heat demand may rise
+ mv_increasing_variables = ["carbon", "heat_demand"]
+ mv_decreasing_variables = ["sap"]
+
impact_summary = []
for recommendations_by_type in property_recommendations:
for rec in recommendations_by_type:
- if rec["type"] in [
- "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"
- ]:
+ if rec["type"] in ["trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"]:
# We don't have a percieved sap impact of mechanical ventilation or trickle vents, and we don't
# have the capacity to score draught proofing
if rec["type"] == "extension_cavity_wall_insulation":
@@ -497,7 +504,9 @@ class Recommendations:
impact_summary.append(
{
"phase": rec["phase"],
+ "representative": rec["recommendation_id"] in representative_ids,
"recommendation_id": rec["recommendation_id"],
+ "measure_type": rec["measure_type"],
"sap": sap + rec["sap_points"],
"carbon": carbon - rec["co2_equivalent_savings"],
"heat_demand": heat_demand - rec["heat_demand"],
@@ -519,15 +528,21 @@ class Recommendations:
# heating_cost_starting and heating_cost_ending are just the values in the EPC. However, with
# heating_cost_ending, we expect that the EPC will predict a heating cost based on what would happen
# if we implemented the recommendation today, so our starting value is the EPC
+
previous_phase_values = {
"sap": float(property_instance.data["current-energy-efficiency"]),
+ # For carbon, even though we generally use the updated figure which includes the carbon
+ # associated to appliances, for this scoring process we use the EPC carbon value. This means
+ # that we don't overestimate the impact since the model uses the EPC carbon value
"carbon": float(property_instance.data["co2-emissions-current"]),
"heat_demand": float(property_instance.data["energy-consumption-current"]),
}
else:
- previous_phase_values_multiple = [x for x in impact_summary if x["phase"] == (rec["phase"] - 1)]
+ previous_phase_values_multiple = [
+ x for x in impact_summary if x["phase"] == (rec["phase"] - 1) and x["representative"]
+ ]
if len(previous_phase_values_multiple) != 1:
# Take an average of each of the previous phases
keys_to_median = ["sap", "carbon", "heat_demand"]
@@ -541,8 +556,13 @@ class Recommendations:
previous_phase_values = previous_phase_values_multiple[0]
# We extract the values for the current phase
+ if rec.get("survey", False):
+ current_phase_sap = rec["sap_points"] + previous_phase_values["sap"]
+ else:
+ current_phase_sap = phase_energy_efficiency_metrics["sap_change"]
+
current_phase_values = {
- "sap": phase_energy_efficiency_metrics["sap_change"],
+ "sap": current_phase_sap,
"carbon": phase_energy_efficiency_metrics["carbon_change"],
"heat_demand": phase_energy_efficiency_metrics["heat_demand"],
}
@@ -552,13 +572,23 @@ class Recommendations:
# For decreasing variables, the new value should be lower than the previous, otherwise we set it to
# the previous
# In either case, we adjudge the recommendation to have had no/negligible impact
- for v in increasing_variables:
+ # However, if the recommendation is mechanical ventilation, this can have a negative SAP impact so
+ # we don't apply this rule
+
+ if rec["type"] == "mechanical_ventilation":
+ phase_increasing_variables = mv_increasing_variables
+ phase_decreasing_variables = mv_decreasing_variables
+ else:
+ phase_increasing_variables = increasing_variables
+ phase_decreasing_variables = decreasing_variables
+
+ for v in phase_increasing_variables:
current_phase_values[v] = (
current_phase_values[v] if current_phase_values[v] > previous_phase_values[v] else
previous_phase_values[v]
)
for v in previous_phase_values:
- if v in decreasing_variables:
+ if v in phase_decreasing_variables:
current_phase_values[v] = (
current_phase_values[v] if current_phase_values[v] < previous_phase_values[v] else
previous_phase_values[v]
@@ -573,13 +603,19 @@ class Recommendations:
"heat_demand": previous_phase_values["heat_demand"] - current_phase_values["heat_demand"],
}
- # Prevent from being negative
+ # Prevent from being negative - apart from ventilation
for metric in ["sap", "carbon", "heat_demand"]:
- property_phase_impact[metric] = (
- 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric]
- )
- if metric == "sap":
- property_phase_impact[metric] = round(property_phase_impact[metric], 2)
+ if rec["type"] != "mechanical_ventilation":
+ property_phase_impact[metric] = (
+ 0 if property_phase_impact[metric] < 0 else property_phase_impact[metric]
+ )
+ if metric == "sap":
+ property_phase_impact[metric] = round(property_phase_impact[metric], 2)
+ else:
+ # We prevent these from being positive
+ property_phase_impact[metric] = (
+ 0 if property_phase_impact[metric] > 0 else property_phase_impact[metric]
+ )
# For the moment, we cap the number of SAP points that can be achieved by LEDs at 2
if rec["type"] == "low_energy_lighting":
@@ -599,11 +635,18 @@ class Recommendations:
# By limiting here, we don't change the value in current_phase_values. This means that the
# future recommendations won't have an impact that is too large
li_sap_limit = RoofRecommendations.get_loft_insulation_sap_limit(
- property_instance.data["roof-energy-eff"], property_instance.data["extension-count"]
+ property_instance.data["roof-energy-eff"], property_instance.roof["insulation_thickness"]
)
if li_sap_limit is not None:
property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit)
+ if rec["type"] == "solar_pv":
+ # We use the SAP points in the recommendation as a minimum
+ property_phase_impact["sap"] = (
+ rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else
+ property_phase_impact["sap"]
+ )
+
# Insert this information into the recommendation.
if not rec.get("survey", False):
rec["sap_points"] = property_phase_impact["sap"]
@@ -620,7 +663,9 @@ class Recommendations:
impact_summary.append(
{
"phase": rec["phase"],
+ "representative": rec["recommendation_id"] in representative_ids,
"recommendation_id": rec["recommendation_id"],
+ "measure_type": rec["measure_type"],
**current_phase_values
}
)
@@ -628,7 +673,9 @@ class Recommendations:
return property_recommendations, impact_summary
@staticmethod
- def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description):
+ def map_descriptions_to_fuel(
+ heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types
+ ):
# Handle the case of community schemes
if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"):
@@ -641,7 +688,7 @@ class Recommendations:
}
raise NotImplementedError("Handle this case")
- mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description]
+ mapped = descriptions_to_fuel_types[heating_description]
heating_fuel = mapped["fuel"]
if hotwater_description in [
@@ -661,7 +708,7 @@ class Recommendations:
"heating_cop": mapped["cop"], "hotwater_cop": 1
}
- mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description]
+ mapped_hotwater = descriptions_to_fuel_types[hotwater_description]
return {
"heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"],
@@ -670,17 +717,24 @@ class Recommendations:
@classmethod
def calculate_recommendation_tenant_savings(
- cls, property_instance, kwh_simulation_predictions, property_recommendations
+ cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None
):
"""
This method inserts the kwh savings and the bill savings that the customer will make from the recommendations
based on the predictions from the ML model
+
+ It also ensures that our solar savings and solar carbon savings are based on the calculations from
+ the solar API and the size of the array, instead of the ML model
+
:param property_instance: Instance of the Property class, for the home associated to property_id
:param kwh_simulation_predictions: dictionary of predictions from the model apis
:param property_recommendations: dictionary of recommendations for the property
+ :param ashp_cop: The coefficient of performance for the air source heat pump.
:return:
"""
+ ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY
+
kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][
kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id)
].merge(
@@ -739,22 +793,42 @@ class Recommendations:
]
).sort_values(["phase", "recommendation_id"], ascending=True).reset_index(drop=True)
+ # We need the recommendation type
+ rec_id_to_type = {
+ rec["recommendation_id"]: rec["type"] for recs in property_recommendations for rec in recs
+ }
+ rec_id_to_type[STARTING_DUMMY_ID_VALUE] = "starting_dummy"
+
for i in range(0, len(kwh_impact_table)):
- current_phase = kwh_impact_table.loc[i, 'phase']
+ current = kwh_impact_table.loc[i]
+ current_phase = current['phase']
previous_phase_id = (current_phase - 1) if (current_phase > 0) else -9999
previous_phase = kwh_impact_table[kwh_impact_table['phase'] == previous_phase_id]
if not previous_phase.empty:
for col in ["predictions_heating", "predictions_hotwater"]:
+ # Check if the recommendation type is ventilation
+ if rec_id_to_type[current["recommendation_id"]] == "mechanical_ventilation":
+ # We expect the kwh to increase
+ if kwh_impact_table.loc[i, col] > previous_phase[col].max():
+ continue
+
if kwh_impact_table.loc[i, col] > previous_phase[col].max():
kwh_impact_table.loc[i, col] = previous_phase[col].max()
+ descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES
+ # Override the ASHP COPs with ashp_cop. NOTE: mutates assumptions.DESCRIPTIONS_TO_FUEL_TYPES in place
+ ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()]
+ for k in ashp_keys:
+ descriptions_to_fuel_types[k]["cop"] = ashp_cop
+
# For heating system recommendations, this could result in a fuel type change so we reflect that
fuel_mapping = pd.DataFrame([
{
"id": epc["id"],
**cls.map_descriptions_to_fuel(
- epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"]
+ epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"],
+ descriptions_to_fuel_types
)
} for epc in property_instance.updated_simulation_epcs
])
@@ -768,7 +842,8 @@ class Recommendations:
**cls.map_descriptions_to_fuel(
property_instance.data["mainheat-description"],
property_instance.data["hotwater-description"],
- property_instance.data["main-fuel"]
+ property_instance.data["main-fuel"],
+ descriptions_to_fuel_types
)
}
]
@@ -797,7 +872,7 @@ class Recommendations:
for recs in property_recommendations:
for rec in recs:
if rec["type"] in [
- "mechanical_ventilation", "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"
+ "trickle_vents", "draught_proofing", "extension_cavity_wall_insulation"
]:
# We cannot score the impact on draught proofing
continue
@@ -808,6 +883,12 @@ class Recommendations:
if rec["type"] == "solar_pv":
rec["kwh_savings"] = rec_impact["solar_kwh_savings"].values[0]
+
+ # Calculate carbon savings from this - emissions in kg and convert to tonnes
+ emissions_kg = rec["kwh_savings"] * assumptions.ELECTRICITY_CARBON_INTENSITY
+ emissions_tonnes = emissions_kg / 1000
+
+ rec["co2_equivalent_savings"] = emissions_tonnes
rec["energy_cost_savings"] = (
rec_impact["solar_kwh_savings"].values[0] * AnnualBillSavings.ELECTRICITY_PRICE_CAP
)
@@ -816,13 +897,18 @@ class Recommendations:
heating_kwh_savings = (
previous_phase_impact["predictions_heating"].mean() - rec_impact["predictions_heating"].values[0]
)
- heating_cost_savings = (
- previous_phase_impact["heating_cost"].mean() - rec_impact["heating_cost"].values[0]
- )
-
hotwater_kwh_savings = (
previous_phase_impact["predictions_hotwater"].mean() - rec_impact["predictions_hotwater"].values[0]
)
+
+ # Shouldn't be positive
+ if rec["type"] == "mechanical_ventilation":
+ heating_kwh_savings = 0 if heating_kwh_savings > 0 else heating_kwh_savings
+ hotwater_kwh_savings = 0 if hotwater_kwh_savings > 0 else hotwater_kwh_savings
+
+ heating_cost_savings = (
+ previous_phase_impact["heating_cost"].mean() - rec_impact["heating_cost"].values[0]
+ )
hotwater_host = (
previous_phase_impact["hotwater_cost"].mean() - rec_impact["hotwater_cost"].values[0]
)
@@ -830,9 +916,8 @@ class Recommendations:
total_kwh_savings = heating_kwh_savings + hotwater_kwh_savings
energy_cost_savings = heating_cost_savings + hotwater_host
- if rec["type"] == "lighting":
- # In this case, we should probably just SKIP but check when we have one!
- raise Exception("Implement me 3")
+ if rec["type"] == "low_energy_lighting":
+ continue
rec["kwh_savings"] = total_kwh_savings
rec["energy_cost_savings"] = energy_cost_savings
diff --git a/recommendations/RoofRecommendations.py b/recommendations/RoofRecommendations.py
index c0fa4eb2..cd7f82c4 100644
--- a/recommendations/RoofRecommendations.py
+++ b/recommendations/RoofRecommendations.py
@@ -52,6 +52,10 @@ class RoofRecommendations:
part for part in materials if part["type"] == "flat_roof_insulation"
]
+ self.room_roof_insulation_materials = [
+ part for part in materials if part["type"] == "room_roof_insulation"
+ ]
+
# Extract the insulation thickness from the roof, which is used throughout this method
self.insulation_thickness = convert_thickness_to_numeric(
self.property.roof["insulation_thickness"],
@@ -60,16 +64,16 @@ class RoofRecommendations:
)
@classmethod
- def get_loft_insulation_sap_limit(cls, roof_energy_eff, extension_count):
+ def get_loft_insulation_sap_limit(cls, roof_energy_eff, existing_thickness):
"""
Get the SAP limit for loft insulation
:param roof_energy_eff:
:return:
"""
- if extension_count == 0:
- # No limit
- return None
+ if str(existing_thickness).isdigit():
+ if float(existing_thickness) >= 250:
+ return 0
if roof_energy_eff in ["Good", "Very Good"]:
return 1
@@ -123,7 +127,11 @@ class RoofRecommendations:
self.property.roof["insulation_thickness"] in ["average", "above_average"]
)
- return full_insulated_room_roof or room_roof_insulated_at_rafters
+ has_non_invasive_recommendation = any(
+ x["type"] == "room_roof_insulation" for x in self.property.non_invasive_recommendations
+ )
+
+ return (full_insulated_room_roof or room_roof_insulated_at_rafters) and not has_non_invasive_recommendation
def recommend(self, phase, measures=None, default_u_values=False):
@@ -134,6 +142,10 @@ class RoofRecommendations:
u_value = self.property.roof["thermal_transmittance"]
+ # If we have a flat roof but we don't have flat roof as a measure, we exit
+ if self.property.roof["is_flat"] and "flat_roof_insulation" not in measures:
+ return
+
# We check if the roof is already insulated and if so, we exit
# Building regulations part L recommend installing at least 270mm of insulation, however generally we
@@ -148,6 +160,9 @@ class RoofRecommendations:
if self.is_room_roof_insulated_or_unsuitable(measures):
return
+ if self.property.roof["is_thatched"]:
+ return
+
# If we have a u-value already, need to implement this
if u_value:
if u_value <= self.BUILDING_REGULATIONS_PART_L_MAX_U_VALUE:
@@ -181,7 +196,8 @@ class RoofRecommendations:
# We firstly handle non-intrusive recommendations, which may override the normal roof insulation recommendations
if ("loft_insulation" in [x["type"] for x in non_invasive_recommendations]) or (
- self.property.roof["is_pitched"] and "loft_insulation" in measures
+ self.property.roof["is_pitched"] and "loft_insulation" in measures and
+ not self.property.roof["is_at_rafters"]
):
self.recommend_roof_insulation(
u_value=u_value,
@@ -282,6 +298,11 @@ class RoofRecommendations:
insulation_materials = pd.DataFrame(insulation_materials)
+ non_invasive_recommendations = next(
+ (r for r in self.property.non_invasive_recommendations if
+ r["type"] == insulation_materials["type"].values[0]), {}
+ )
+
lowest_selected_u_value = None
recommendations = []
for _, insulation_material_group in insulation_materials.groupby("description"):
@@ -421,14 +442,15 @@ class RoofRecommendations:
"description": self.make_roof_insulation_description(material),
"starting_u_value": u_value,
"new_u_value": new_u_value,
- "sap_points": None,
+ "sap_points": non_invasive_recommendations.get("sap_points", 0),
"already_installed": already_installed,
"simulation_config": simulation_config,
"description_simulation": {
"roof-description": new_description,
"roof-energy-eff": new_efficiency
},
- **cost_result
+ **cost_result,
+ "survey": non_invasive_recommendations.get("survey", False)
}
)
@@ -478,28 +500,22 @@ class RoofRecommendations:
:return:
"""
- # TODO: We temporarilty use costs from SCIS for RIR insulation. The costing was £180/m2 floor
- roof_roof_insulation_materials = [
- {
- "type": "room_roof_insulation",
- "description": "Insulating the ceiling of the roof roof and re-decorate",
- "depths": [100],
- "depth_unit": "mm",
- "r_value_per_mm": 0.038,
- "thermal_conductivity": 0.022,
- "cost": [180],
- }
- ]
+ # We have a list of materials that can be used for room roof insulation
+ # We will iterate over these materials and recommend them based on the current u-value of the roof
+ # and the cost of the materials
rir_non_invasive_recommendation = next(
(x for x in self.property.non_invasive_recommendations if x["type"] == "room_roof_insulation"), {}
)
+ insulation_materials = pd.DataFrame(self.room_roof_insulation_materials)
+
# lowest_selected_u_value = None
recommendations = []
- for material in roof_roof_insulation_materials:
- for depth, cost_per_unit in zip(material["depths"], material["cost"]):
- part_u_value = r_value_per_mm_to_u_value(depth, material["r_value_per_mm"])
+ for _, material_group in insulation_materials.groupby("description"):
+ for material in material_group.itertuples():
+
+ part_u_value = r_value_per_mm_to_u_value(material.depth, material.r_value_per_mm)
_, new_u_value = calculate_u_value_uplift(u_value, part_u_value)
new_u_value = math.ceil(new_u_value * 100.0) / 100.0
@@ -507,13 +523,11 @@ class RoofRecommendations:
# We allow a small tolerance for error so we don't discount the recommendation entirely
estimated_cost = (
- cost_per_unit * self.property.insulation_floor_area if
+ material.total_cost * self.property.insulation_floor_area if
rir_non_invasive_recommendation.get("cost") is None else
rir_non_invasive_recommendation.get("cost")
)
- sap_points = rir_non_invasive_recommendation.get("sap_points", None)
-
# Could also be Roof room(s), ceiling insulated
new_descriptin = "Roof room(s), insulated"
roof_ending_config = RoofAttributes(new_descriptin).process()
@@ -562,7 +576,7 @@ class RoofRecommendations:
"description": "Insulate room in roof at rafters and re-decorate",
"starting_u_value": u_value,
"new_u_value": new_u_value,
- "sap_points": sap_points,
+ "sap_points": rir_non_invasive_recommendation.get("sap_points", None),
"simulation_config": simulation_config,
"description_simulation": {
"roof-description": new_descriptin,
diff --git a/recommendations/SecondaryHeating.py b/recommendations/SecondaryHeating.py
index 7c20bcdd..e63951d9 100644
--- a/recommendations/SecondaryHeating.py
+++ b/recommendations/SecondaryHeating.py
@@ -9,12 +9,6 @@ class SecondaryHeating:
system.
"""
- # The list of existing heating systems that are accepted
- ACCEPTED_MAINHEAT_DESCRIPTIONS = ["Boiler and radiators, mains gas"]
- ACCEPTED_SECONDHEAT_DESCRIPTIONS = ["Room heaters, electric"]
- # These are the heaters where works are required to remove them
- FIXED_HEATER_DESCRIPTIONS = ["Room heaters, electric"]
-
def __init__(self, property_instance: Property):
self.property = property_instance
self.costs = Costs(self.property)
@@ -25,18 +19,10 @@ class SecondaryHeating:
# Reset
self.recommendation = []
- if self.property.main_heating["clean_description"] not in self.ACCEPTED_MAINHEAT_DESCRIPTIONS:
- return
-
- # TODO: We need to clean secondary data
- if self.property.data['secondheat-description'] not in self.ACCEPTED_SECONDHEAT_DESCRIPTIONS:
- return
-
- if self.property.data['secondheat-description'] in self.FIXED_HEATER_DESCRIPTIONS:
- # We have an associated cost otherwise, there is no cost
- n_rooms = self.property.data['number-heated-rooms']
+ if self.property.data['number-habitable-rooms'] > self.property.data['number-heated-rooms']:
+ n_rooms = self.property.data['number-habitable-rooms'] - self.property.data['number-heated-rooms']
else:
- n_rooms = 0
+ n_rooms = self.property.data["number-heated-rooms"]
costs = self.costs.heater_removal(n_rooms=n_rooms)
diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py
index 66c1d0c3..ee07ff28 100644
--- a/recommendations/SolarPvRecommendations.py
+++ b/recommendations/SolarPvRecommendations.py
@@ -1,24 +1,39 @@
import numpy as np
import pandas as pd
+import backend.app.assumptions as assumptions
from recommendations.Costs import Costs
from recommendations.recommendation_utils import override_costs, estimate_pitched_roof_area
class SolarPvRecommendations:
- # Solar panel specs based on Eurener 400s solar panels
- # https://midsummerwholesale.co.uk/buy/eurener/eurener-400w-mepv-zebra-ab-half-cut-mono
- # Approximate area of the solar panels
- SOLAR_PANEL_AREA = 1.79
- # Wattage per panel - this is based on the average wattage of a solar panel being between 250w and 420w
- # This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group
- SOLAR_PANEL_WATTAGE = 400
-
+ # For domestic properties, we don't recommend a solar PV system with wattage outside of these
+ # bounds
MAX_SYSTEM_WATTAGE = 6000
MIN_SYSTEM_WATTAGE = 1000
+ # The maximum area of roof we allow to be covered in solar panels for our recommendations.
MAX_ROOF_AREA_PERCENTAGE = 0.7
+ SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1
+
+ BACKUP_PANEL_PERFORMANCE = pd.DataFrame(
+ [
+ {
+ "n_panels": 4,
+ "array_wattage": 1600,
+ "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 1600,
+ "panneled_roof_area": 4 * assumptions.RDSAP_AREA_PER_PANEL
+ },
+ {
+ "n_panels": 8,
+ "array_wattage": 3200,  # same keys in every row so each DataFrame column is fully populated
+ "initial_ac_kwh_per_year": assumptions.MEDIAN_WATTAGE_TO_AC * 3200,
+ "panneled_roof_area": 8 * assumptions.RDSAP_AREA_PER_PANEL
+ },
+ ]
+ )
+
def __init__(self, property_instance):
"""
:param property_instance: Instance of the Property class, for the home associated to property_id
@@ -42,46 +57,6 @@ class SolarPvRecommendations:
return trimmed_list
- def mds_recommend(self, phase=None, solar_pv_percentage=0.5):
- # For specific usage within the mds report
-
- solar_pv_roof_area = self.property.get_solar_pv_roof_area(solar_pv_percentage)
-
- number_solar_panels = np.floor(solar_pv_roof_area / self.SOLAR_PANEL_AREA)
- solar_panel_wattage = number_solar_panels * self.SOLAR_PANEL_WATTAGE
-
- solar_panel_wattage = np.clip(
- a=solar_panel_wattage, a_min=self.MIN_SYSTEM_WATTAGE, a_max=self.MAX_SYSTEM_WATTAGE
- )
-
- # We now have a property which is potentially suitable for solar PV
- roof_coverage_percent = round(solar_pv_percentage * 100)
- # Given the wattage, we estimate the cost of the solar PV system. This is based on the MCS database
- # of solar PV installations
- cost_result = self.costs.solar_pv(wattage=solar_panel_wattage, has_battery=False)
- kw = np.floor(solar_panel_wattage / 100) / 10
-
- description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p"
- f"anel system on {round(roof_coverage_percent)}% the roof.")
-
- return [
- {
- "phase": phase,
- "parts": [],
- "type": "solar_pv",
- "description": description,
- "starting_u_value": None,
- "new_u_value": None,
- "sap_points": None,
- "already_installed": False,
- **cost_result,
- # This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we scale
- # back up here
- "photo_supply": roof_coverage_percent,
- "has_battery": False
- }
- ]
-
def recommend_building_analysis(self, phase):
"""
This recommendation approach handles the case of producing solar PV recommendations at the building level,
@@ -103,13 +78,22 @@ class SolarPvRecommendations:
for rank, recommendation_config in best_configurations.iterrows():
# If we dont have the panneled_roof_area in the recommendation_config we calculate it
if recommendation_config.get("panneled_roof_area", None):
- roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / total_roof_area * 100)
+ # We spread the coverage across the individual units
+ roof_coverage_percent = round(
+ ((recommendation_config["panneled_roof_area"] / total_roof_area) * 100) / n_units
+ )
else:
raise Exception("IMPLEMENT ME")
+
+ n_floors = (
+ self.property.number_of_storeys["number_of_storeys"] if
+ self.property.number_of_storeys["number_of_storeys"] is not None else 3
+ )
+
total_cost = self.costs.solar_pv(
array_cost=recommendation_config.get("cost", None),
n_panels=recommendation_config["n_panels"],
- n_floors=self.property.number_of_storeys["number_of_storeys"],
+ n_floors=n_floors,
needs_inverter=True,
)["total"] / n_units
@@ -203,6 +187,20 @@ class SolarPvRecommendations:
roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100)
# We round up to the nearest 5
roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5
+
+ # Typically, we've observed that every 5% of additional roof coverage will result in at least
+ # 1 additional SAP point (though often 2 points). Given this, we can add a reasonable minimum
+ # for the number of SAP points we might expect. We've observed that for some cases where properties
+ # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict
+ # the number of SAP points. This appears to be due to a relatively small number of properties
+ # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a
+ # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels.
+ # Because panels are the final recommendation, they are often the measure that takes the home
+ # into the medium to high EPC A ranges and so because of a lack of training data, this means that
+ # we might sometimes under-predict. This minimum is intended to try and reduce the negative impact
+ # of this. This minimum is used in Recommendations.calculate_recommendation_impact
+ minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE
+
for has_battery in [False, True]:
cost_result = self.costs.solar_pv(
has_battery=has_battery,
@@ -212,11 +210,14 @@ class SolarPvRecommendations:
)
kw = np.floor(recommendation_config["array_wattage"] / 100) / 10
if has_battery:
- description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) panel system on "
- f"{round(roof_coverage_percent)}% the roof, with a battery storage system.")
+ description = (
+ f"Install a {kw} kilowatt-peak (kWp) solar panel system, with a battery."
+ )
else:
- description = (f"Install a {kw} kilowatt-peak (kWp) solar photovoltaic (PV) p"
- f"anel system on {round(roof_coverage_percent)}% the roof.")
+ description = f"Install a {kw} kilowatt-peak (kWp) solar panel system."
+
+ if self.property.in_conservation_area:
+ description += " Property is in a conservation area - please check with local planning authority."
already_installed = "solar_pv" in self.property.already_installed
if already_installed:
@@ -231,7 +232,7 @@ class SolarPvRecommendations:
"description": description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": minimum_sap_points,
"already_installed": already_installed,
**cost_result,
# This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we
diff --git a/recommendations/VentilationRecommendations.py b/recommendations/VentilationRecommendations.py
index 9738b898..a82e4df5 100644
--- a/recommendations/VentilationRecommendations.py
+++ b/recommendations/VentilationRecommendations.py
@@ -29,7 +29,7 @@ class VentilationRecommendations(Definitions):
def identify_ventilation(self):
self.has_ventilaion = self.property.data["mechanical-ventilation"] in self.VENTILATION_DESCRIPTIONS
- def recommend(self):
+ def recommend(self, phase):
"""
If there is no ventilation, we recommend installing ventilation
@@ -63,7 +63,7 @@ class VentilationRecommendations(Definitions):
# We recommend installing two mechanical ventilation systems
self.recommendation = [
{
- "phase": None,
+ "phase": phase,
"parts": part,
"type": part[0]["type"],
"measure_type": "mechanical_ventilation",
@@ -79,7 +79,13 @@ class VentilationRecommendations(Definitions):
"total": estimated_cost,
# We use a very simple and rough estimate of 4 hours per unit
"labour_hours": labour_hours,
- "labour_days": labour_days # Assume 8 hour day
+ "labour_days": labour_days, # Assume 8 hour day
+ "simulation_config": {
+ "mechanical_ventilation_ending": "mechanical, extract only",
+ },
+ "description_simulation": {
+ "mechanical-ventilation": "mechanical, extract only"
+ }
}
]
diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py
index c7917911..92147fb8 100644
--- a/recommendations/WallRecommendations.py
+++ b/recommendations/WallRecommendations.py
@@ -385,6 +385,11 @@ class WallRecommendations(Definitions):
if insulation_thickness == "below average":
cavity_width = cavity_width * (1 - PARTIALLY_FILLED_PERCENTAGE_ASSUMPTION)
+ non_invasive_recommendations = next(
+ (r for r in self.property.non_invasive_recommendations if
+ r["type"] == insulation_materials["type"].values[0]), {}
+ )
+
# Test the different fill options
lowest_selected_u_value = None
recommendations = []
@@ -475,14 +480,15 @@ class WallRecommendations(Definitions):
"description": description,
"starting_u_value": u_value,
"new_u_value": new_u_value,
- "sap_points": None,
+ "sap_points": non_invasive_recommendations.get("sap_points", None),
"already_installed": already_installed,
"simulation_config": simulation_config,
"description_simulation": {
"walls-description": "Cavity wall, filled cavity",
"walls-energy-eff": "Good"
},
- **cost_result
+ **cost_result,
+ "survey": non_invasive_recommendations.get("survey", False)
}
)
@@ -540,15 +546,10 @@ class WallRecommendations(Definitions):
lowest_selected_u_value = None
recommendations = []
-
- iwi_non_invasive_recommendations = next(
- (r for r in self.property.non_invasive_recommendations if r["type"] == "internal_wall_insulation"), {}
+ non_invasive_recommendations = next(
+ (r for r in self.property.non_invasive_recommendations if
+ r["type"] == insulation_materials["type"].values[0]), {}
)
- ewi_non_invasive_recommendations = next(
- (r for r in self.property.non_invasive_recommendations if r["type"] == "external_wall_insulation"), {}
- )
- if ewi_non_invasive_recommendations:
- raise NotImplementedError("Implement ewi non-invasive recommendations")
for _, insulation_material_group in insulation_materials.groupby("description"):
@@ -590,31 +591,25 @@ class WallRecommendations(Definitions):
if already_installed:
cost_result = override_costs(cost_result)
+ if non_invasive_recommendations.get("cost") is not None:
+ raise NotImplementedError(
+ "Not handled passing costs from non-invasive recommendations for iwi"
+ )
+
if material["type"] == "internal_wall_insulation":
-
- if iwi_non_invasive_recommendations.get("cost") is not None:
- raise NotImplementedError(
- "Not handled passing costs from non-invasive recommendations for iwi"
- )
-
- sap_points = iwi_non_invasive_recommendations.get("sap_points", None)
- survey = iwi_non_invasive_recommendations.get("survey", False)
-
new_description = self.get_internal_external_wall_description(
self.INTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value
)
-
elif material["type"] == "external_wall_insulation":
-
- sap_points = ewi_non_invasive_recommendations.get("sap_points", None)
- survey = ewi_non_invasive_recommendations.get("survey", False)
-
new_description = self.get_internal_external_wall_description(
self.EXTERNALLY_INSULATED_WALL_DESCRIPTIONS, new_u_value
)
else:
raise ValueError("Invalid material type")
+ sap_points = non_invasive_recommendations.get("sap_points", None)
+ survey = non_invasive_recommendations.get("survey", False)
+
wall_ending_config = WallAttributes(new_description).process()
walls_simulation_config = check_simulation_difference(
diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py
index 1f755369..46e56c93 100644
--- a/recommendations/WindowsRecommendations.py
+++ b/recommendations/WindowsRecommendations.py
@@ -215,21 +215,29 @@ class WindowsRecommendations:
"glazed-type": glazed_type_ending,
}
+ measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing"
+
+ non_invasive_recommendation = next(
+ (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]),
+ {}
+ )
+
self.recommendation = [
{
"phase": phase,
"parts": [],
"type": "windows_glazing",
- "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing",
+ "measure_type": measure_type,
"description": description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": non_invasive_recommendation.get("sap_points", None),
"already_installed": already_installed,
**cost_result,
"is_secondary_glazing": is_secondary_glazing,
"description_simulation": description_simulation,
"simulation_config": simulation_config,
+ "survey": non_invasive_recommendation.get("survey", None),
}
]
diff --git a/recommendations/county_to_region.py b/recommendations/county_to_region.py
index f7d5193f..13c1cdaa 100644
--- a/recommendations/county_to_region.py
+++ b/recommendations/county_to_region.py
@@ -111,8 +111,11 @@ county_to_region_map = {
'Windsor and Maidenhead': 'South East England', 'Woking': 'South East England', 'Wokingham': 'South East England',
'Worthing': 'South East England', 'Wycombe': 'South East England',
'Bath and North East Somerset': 'South West England', 'Bournemouth': 'South West England',
- 'Bristol': 'South West England', 'Cheltenham': 'South West England', 'Christchurch': 'South West England',
- 'City of Bristol': 'South West England', 'Cornwall': 'South West England', 'Cotswold': 'South West England',
+ 'Bristol': 'South West England',
+ 'Cheltenham': 'South West England', 'Christchurch': 'South West England',
+ 'City of Bristol': 'South West England',
+ 'Bristol, City of': 'South West England',
+ 'Cornwall': 'South West England', 'Cotswold': 'South West England',
'Devon': 'South West England', 'Dorset': 'South West England', 'East Devon': 'South West England',
'East Dorset': 'South West England', 'Exeter': 'South West England', 'Forest of Dean': 'South West England',
'Gloucester': 'South West England', 'Gloucestershire': 'South West England',
@@ -132,7 +135,10 @@ county_to_region_map = {
'Merthyr Tydfil': 'Wales', 'Monmouthshire': 'Wales', 'Mountain Ash': 'Wales', 'Neath Port Talbot': 'Wales',
'Newport': 'Wales', 'Pembrokeshire': 'Wales', 'Penarth': 'Wales', 'Pentre': 'Wales', 'Pontyclun': 'Wales',
'Pontypridd': 'Wales', 'Porth': 'Wales', 'Porthcawl': 'Wales', 'Powys': 'Wales', 'Rhondda Cynon Taff': 'Wales',
- 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales', 'The Vale of Glamorgan': 'Wales', 'Tonypandy': 'Wales',
+ 'Rhoose': 'Wales', 'Sully': 'Wales', 'Swansea': 'Wales',
+ 'The Vale of Glamorgan': 'Wales',
+ 'Vale of Glamorgan': 'Wales',
+ 'Tonypandy': 'Wales',
'Torfaen': 'Wales', 'Treharris': 'Wales', 'Treorchy': 'Wales', 'Wrexham': 'Wales', 'Birmingham': 'West Midlands',
'Bromsgrove': 'West Midlands', 'Cannock Chase': 'West Midlands', 'Coventry': 'West Midlands',
'Dudley': 'West Midlands', 'East Staffordshire': 'West Midlands', 'Herefordshire': 'West Midlands',
diff --git a/recommendations/optimiser/optimiser_functions.py b/recommendations/optimiser/optimiser_functions.py
index c1123e3d..05b9ec42 100644
--- a/recommendations/optimiser/optimiser_functions.py
+++ b/recommendations/optimiser/optimiser_functions.py
@@ -1,10 +1,14 @@
-def prepare_input_measures(property_recommendations, goal):
+import backend.app.assumptions as assumptions
+
+
+def prepare_input_measures(property_recommendations, goal, needs_ventilation):
"""
Basic function to convert recommendations_to_upload to a format that is
suitable for the optimiser - large
:param property_recommendations: object containing the recommendations, created in the plan trigger api
:param goal: goal to be optimised for, should be one of the keys in gain_map. E.g. if the gain is SAP points,
the goal should reflect that desired gain
+ :param needs_ventilation: boolean to indicate if the property needs ventilation
:return: Nested list of input measures
"""
@@ -16,23 +20,58 @@ def prepare_input_measures(property_recommendations, goal):
if not goal_key:
raise NotImplementedError("Not implemented this gain type - investigate me")
+ # We only ever have one ventilation measure for now
+ ventilation_recommendation = next(
+ (measure[0] for measure in property_recommendations if measure[0]["type"] == "mechanical_ventilation"),
+ {}
+ )
+
input_measures = []
for recs in property_recommendations:
+ if needs_ventilation and recs[0]["type"] == "mechanical_ventilation":
+ # If the house needs ventilation, ventilation will be packaged with the fabric measure so
+ # we don't need to optimise it independently
+ continue
+
if recs[0]["type"] == "solar_pv":
# if the recommendation is a solar recommendation with a battery, we exclude it from the optimisation.
recs = [r for r in recs if ~r["has_battery"]]
- input_measures.append(
- [
+ recs_to_append = [rec for rec in recs if rec["energy_cost_savings"] >= 0]
+ if not recs_to_append:
+ continue
+
+ to_append = []
+ for rec in recs_to_append:  # only measures with non-negative energy cost savings are optimised
+ # We bundle the impact of ventilation with the measure
+ total = (
+ rec["total"] + ventilation_recommendation["total"]
+ if rec["type"] in assumptions.measures_needing_ventilation
+ else rec["total"]
+ )
+ gain = (
+ rec[goal_key] + ventilation_recommendation[goal_key]
+ if rec["type"] in assumptions.measures_needing_ventilation
+ else rec[goal_key]
+ )
+
+ rec_type = (
+ "+".join(
+ [rec["type"], ventilation_recommendation["type"]]
+ ) if rec["type"] in assumptions.measures_needing_ventilation
+ else rec["type"]
+ )
+
+ to_append.append(
{
"id": rec["recommendation_id"],
- "cost": rec["total"],
- "gain": rec[goal_key],
- "type": rec["type"]
+ "cost": total,
+ "gain": gain,
+ "type": rec_type
}
- for rec in recs
- ]
- )
+ )
+
+ input_measures.append(to_append)
return input_measures
diff --git a/recommendations/rdsap_tables.py b/recommendations/rdsap_tables.py
index 16c7d26e..e56faf7c 100644
--- a/recommendations/rdsap_tables.py
+++ b/recommendations/rdsap_tables.py
@@ -257,7 +257,7 @@ epc_wall_description_map = {
"Timber frame, as built, partial insulation": "Timber frame as built",
"Timber frame, as built, no insulation": "Timber frame as built",
"Timber frame, with external insulation": "Timber frame with internal insulation",
-
+ "Timber frame, with internal insulation": "Timber frame with internal insulation",
############################
# Sandstone/limestones wall mappings
############################
diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py
index 00da6107..602684cf 100644
--- a/recommendations/recommendation_utils.py
+++ b/recommendations/recommendation_utils.py
@@ -205,7 +205,7 @@ def get_wall_u_value(
mapped_value = wall_uvalues_df[
wall_uvalues_df["Wall_type"] == mapped_description
- ][age_band].values[0]
+ ][age_band].values[0]
if pd.isnull(mapped_value) and "Park home" in mapped_description:
# We don't know enough in this case so we default to 0
@@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type):
Using the property type, we estimate the number of floors in the property
"""
+ if property_type is None:
+ return None
+
if property_type == "House":
number_of_floors = 2
elif property_type in ["Flat", "Bungalow"]:
@@ -560,7 +563,7 @@ def get_floor_u_value(
insulation_lookup = s11[
s11["Age_band"].str.contains(age_band) & s11["Floor_construction"]
== floor_type
- ]
+ ]
if insulation_lookup.empty:
insulation_thickness = 0
else:
diff --git a/survey_report/app.py b/survey_report/app.py
new file mode 100644
index 00000000..f6eddb8d
--- /dev/null
+++ b/survey_report/app.py
@@ -0,0 +1,270 @@
+import os
+import requests
+import PyPDF2
+from string import Template
+
+import pandas as pd
+
+from survey_report.extraction.detect_report_type import detect_report_type
+from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
+
+
def generate_html_report(template_path, output_path, data):
    """
    Reads an HTML template file, injects dynamic values, and generates a final HTML report.

    Placeholders use ``string.Template`` syntax (``$name`` / ``${name}``). Substitution is
    done with ``safe_substitute``, so a placeholder with no matching key in ``data`` is
    left in the output untouched instead of raising ``KeyError``.

    Args:
    - template_path (str): Path to the HTML template file.
    - output_path (str): Path to save the generated HTML file. Missing parent
      directories are created.
    - data (dict): Dictionary containing dynamic values for the report. Values are
      inserted verbatim (no HTML escaping is applied).
    """
    # Read the template file
    with open(template_path, "r", encoding="utf-8") as f:
        html_template = Template(f.read())

    # Replace placeholders with actual data
    final_html = html_template.safe_substitute(data)

    # A fresh checkout has no output folder yet, so make sure it exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the generated HTML file
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(final_html)

    print(f"HTML report generated successfully: {output_path}")
+
+
def stringify_number(num: int, rounding: bool = True) -> str:
    """
    Formats a number for display in the report.

    Values below 100,000 are shown with thousands separators (e.g. "1,300"); larger values
    are shown in thousands with a "k" suffix (e.g. "250k").

    :param num: The number to format.
    :param rounding: When True the value is rounded UP to the next 100 (below 100,000) or
        the next 1,000 (100,000 and above) before formatting. When False the value is used
        as given (the "k" form still drops anything below a whole thousand).
    :return: The formatted string.
    """
    if num < 100000:  # 5 figures or fewer
        rounded_num = ((num + 99) // 100) * 100 if rounding else num
        if rounded_num < 100000:
            return f"{rounded_num:,}"
        # Rounding up pushed the value to 100,000 (e.g. 99,950): use the "k" form so it
        # reads the same as an input of exactly 100,000.
        return f"{rounded_num // 1000}k"

    # More than 5 figures
    rounded_num = ((num + 999) // 1000) * 1000 if rounding else num
    return f"{rounded_num // 1000}k"
+
+
class PlacidApi:
    """
    Minimal client for the Placid REST API, used to render the survey report PDF from a
    Placid template and download the result.
    """

    # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
    ERROR_CODES = {
        400: "Bad request",
        401: "Unauthorized",
        404: "Template Not found",
        422: "Validation error",
        429: "Rate limit exceeded",
        500: "Internal server error",
    }

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, api_key):
        """
        :param api_key: Placid API token, sent as a Bearer token on every request.
        """
        self.api_key = api_key

        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        }

    def _parse_response(self, response):
        """
        Returns the decoded JSON body of a Placid response.

        :raises requests.HTTPError: If the response carries an HTTP error status. The
            message uses the description from ERROR_CODES where one is known.
        """
        if response.status_code >= 400:
            reason = self.ERROR_CODES.get(response.status_code, "Unexpected error")
            raise requests.HTTPError(
                f"Placid API request failed ({response.status_code}): {reason}",
                response=response,
            )
        return response.json()

    def create_pdf(
            self,
            template_uuid: str,
            current_epc_rating: str,
            current_epc_rating_colour: str,
            post_retrofit_epc_rating: str,
            post_retrofit_epc_rating_colour: str,
    ):
        """
        Queues the generation of a single-page PDF from the given template.

        :param template_uuid: UUID of the Placid template to render.
        :param current_epc_rating: Text for the "current_epc_rating" layer.
        :param current_epc_rating_colour: Text colour for the "current_epc_rating" layer.
        :param post_retrofit_epc_rating: Text for the "post_retrofit_epc_rating" layer.
        :param post_retrofit_epc_rating_colour: Text colour for the
            "post_retrofit_epc_rating" layer.
        :return: The decoded JSON response; its "id" is what get_pdf expects.
        :raises requests.HTTPError: If Placid rejects the request.
        """
        url = "https://api.placid.app/api/rest/pdfs"

        body = {
            "webhook_success": None,
            "passthrough": None,
            "pages": [
                {
                    "template_uuid": template_uuid,
                    "layers": {
                        "current_epc_rating": {
                            "text": current_epc_rating,
                            "text_color": current_epc_rating_colour,
                        },
                        "post_retrofit_epc_rating": {
                            "text": post_retrofit_epc_rating,
                            "text_color": post_retrofit_epc_rating_colour,
                        }
                    },
                },
            ]
        }

        response = requests.post(
            url,
            headers=self.headers,
            json=body,
            timeout=self.REQUEST_TIMEOUT,
        )

        return self._parse_response(response)

    def get_pdf(
            self,
            pdf_id: str,
            output_path: str = "survey_report/example_data/output.pdf",
            poll_interval: float = 5,
            max_attempts: int = 60,
    ):
        """
        Poll the API every ``poll_interval`` seconds until the PDF is ready, then download it.

        :param pdf_id: The "id" returned by create_pdf.
        :param output_path: Where the downloaded PDF is written.
        :param poll_interval: Seconds to wait between status checks.
        :param max_attempts: Number of status checks before giving up.
        :return: ``output_path``.
        :raises requests.HTTPError: If a status check or the download fails.
        :raises RuntimeError: If Placid reports that rendering failed.
        :raises TimeoutError: If the PDF is still not ready after ``max_attempts`` checks.
        """
        import time  # local import: only needed while polling

        url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}"

        for attempt in range(1, max_attempts + 1):
            response = requests.get(
                url,
                headers=self.headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response_body = self._parse_response(response)

            # NOTE(review): "error" is taken from the Placid REST docs as the failed-render
            # status - confirm against the current API version.
            if response_body.get("status") == "error":
                raise RuntimeError(f"Placid failed to render PDF {pdf_id}")

            # A freshly queued job has "pdf_url": None; it is filled in once rendering is done.
            pdf_url = response_body.get("pdf_url")
            if pdf_url:
                break

            if attempt < max_attempts:
                time.sleep(poll_interval)
        else:
            raise TimeoutError(f"PDF {pdf_id} was not ready after {max_attempts} status checks")

        # Download the PDF from this URL
        pdf_download = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
        pdf_download.raise_for_status()
        with open(output_path, "wb") as f:
            f.write(pdf_download.content)

        return output_path
+
+
def _read_pdf_text(filepath):
    """
    Returns ``(first_page_text, full_text)`` for the PDF at ``filepath``.
    """
    with open(filepath, "rb") as f:
        pdf = PyPDF2.PdfReader(f)
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    if not page_texts:
        raise ValueError(f"PDF has no pages: {filepath}")

    return page_texts[0], "".join(page_texts)


def _example_folders(base_dir):
    """
    Builds the file configuration for the three example flats shipped under
    ``<base_dir>/example_data``.
    """
    folders = []
    for flat in (1, 2, 3):
        flat_dir = os.path.join(base_dir, "example_data", f"Flat {flat}")
        prefix = f"3 WILLIS ROAD FLAT {flat}"
        folders.append({
            "site_notes": os.path.join(flat_dir, f"{prefix} PRE EPR SITE NOTES.pdf"),
            "epr": os.path.join(flat_dir, f"{prefix} PRE EPR PDF.pdf"),
            "scenario_site_notes": os.path.join(flat_dir, f"{prefix} POST EPR SITE NOTES.pdf"),
        })
    return folders


def handler():
    """
    Performs the data extraction process for the survey report.

    For each example property this reads the pre-retrofit site notes, the EPR and the
    post-retrofit (scenario) site notes, estimates the valuation uplift, renders the
    summary PDF through Placid and finally writes the HTML report.

    The Placid API key is read from the ``PLACID_API_KEY`` environment variable; the
    template can be overridden with ``PLACID_TEMPLATE_UUID``.

    :return: DataFrame with one row of extracted and derived figures per property.
    :raises RuntimeError: If ``PLACID_API_KEY`` is not set.
    :raises ValueError: If one of the input PDFs is not a recognised report type.
    """
    from backend.ml_models.Valuation import PropertyValuation

    # Secrets must never live in source control: take the key from the environment.
    placid_api_key = os.environ.get("PLACID_API_KEY")
    if not placid_api_key:
        raise RuntimeError("The PLACID_API_KEY environment variable must be set")
    template_uuid = os.environ.get("PLACID_TEMPLATE_UUID", "5bst9mh1q9lk9")
    placid_api = PlacidApi(placid_api_key)

    current_property_value = 250000  # Needs to be an input

    EPC_COLOURS = {
        "A": "#117d58",
        "B": "#2da55c",
        "C": "#8dbd40",
        "D": "#f7cd14",
        "E": "#f3a96a",
        "F": "#ef8026",
        "G": "#e41e3b",
    }

    # Resolve everything relative to this package rather than one developer's home folder.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    folders = _example_folders(base_dir)

    data = []
    data_sheet = None
    for data_config in folders:

        file_mapping = {}
        for filename, filepath in data_config.items():
            first_page, text = _read_pdf_text(filepath)

            # Check the report type
            if detect_report_type(first_page) is not None:
                file_mapping[filename] = text

        unrecognised = [name for name in data_config if name not in file_mapping]
        if unrecognised:
            raise ValueError(
                "Unrecognised report type for: "
                + ", ".join(f"{name} ({data_config[name]})" for name in unrecognised)
            )

        # This is only set up to work with quido site notes so we must have it
        site_notes = SiteNotesExtractor(file_mapping["site_notes"]).extract_all()

        # We also must have an EPR
        epr = EPRExtractor(file_mapping["epr"]).extract_all()

        # Valuation simulation
        scenario_site_notes = SiteNotesExtractor(file_mapping["scenario_site_notes"]).extract_all()

        valuation_uplift = PropertyValuation.estimate_valuation_improvement(
            current_value=current_property_value,
            current_epc=site_notes["Current EPC Band"],
            target_epc=scenario_site_notes["Current EPC Band"],
        )
        valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value)

        # Prepare the data for output
        bill_savings = round(
            site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)']
        )

        carbon_savings = round(
            site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"],
            2
        )

        # TODO: payback period is not implemented yet (it needs the cost of the measures).
        # It used to raise NotImplementedError here unconditionally, which made everything
        # below unreachable; nothing downstream consumes it, so it is simply left out.

        report_data = {
            "current_epc_rating": site_notes["Current EPC Band"],
            "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]],
            "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"],
            "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]],
            "bill_savings": stringify_number(bill_savings),
            "valuation_improvement": stringify_number(valuation_difference),
            "carbon_savings": carbon_savings,
        }

        # We now produce the combined data sheet which is the starting figure:
        data_sheet = {**epr, **site_notes}
        del data_sheet["Building Dimensions"]
        # We unnest the Total Building Dimensions
        totals = data_sheet.pop("Total Building Dimensions")
        data_sheet["Total Building Floor Area (m2)"] = totals["floor_area"]
        data_sheet["Total Building Heat Loss Area (m2)"] = totals["heat_loss_area"]
        data.append({**data_sheet, **report_data})

        # create_pdf only knows the EPC rating layers, so pass just those.
        pdf_layers = {
            key: report_data[key]
            for key in (
                "current_epc_rating",
                "current_epc_rating_colour",
                "post_retrofit_epc_rating",
                "post_retrofit_epc_rating_colour",
            )
        }
        create_pdf_response = placid_api.create_pdf(
            template_uuid=template_uuid, **pdf_layers
        )
        # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None}
        # Download locally
        placid_api.get_pdf(create_pdf_response["id"])

    data = pd.DataFrame(data)

    # Generate the HTML report
    # Placeholder locations; there is a single output file, so the report reflects the
    # last property processed.
    if data_sheet is not None:
        template_path = os.path.join(base_dir, "template.html")
        output_path = os.path.join(base_dir, "output", "report.html")
        logo_path = os.path.join(base_dir, "assets", "logo.png")
        generate_html_report(
            template_path, output_path,
            data={
                "address": data_sheet["Address"],
                "logo_path": logo_path,
                "current_epc": data_sheet["Current EPC Band"],
                "current_sap": data_sheet["Current SAP Rating"],
                "potential_epc": "A",  # TODO PLACEHOLDER
                "potential_sap": 91,  # TODO PLACEHOLDER
            }
        )

    return data
diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py
new file mode 100644
index 00000000..434a3fb4
--- /dev/null
+++ b/survey_report/extraction/detect_report_type.py
@@ -0,0 +1,22 @@
+import re
+
+
def detect_report_type(first_page):
    """
    Detects the type of report based on the first page of the report

    :param first_page: Text extracted from the first page of the PDF. None or an empty
        string (e.g. a scanned page with no text layer) is treated as unrecognised.
    :return: "quidos_site_notes", "quidos_epr", or None when the page matches neither.
    """
    # Set up for the minute to handle quidos files. We have the Elmhurst logic so we can introduce
    # this when we need

    if not first_page:
        # Nothing to inspect; avoid handing None to the regex engine.
        return None

    # Site notes start with the "Created <date> for Quidos Ltd ..." banner.
    if re.match(
        r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator",
        first_page
    ):
        return "quidos_site_notes"

    # The EPR carries its title block anywhere on the first page.
    if re.search(r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1", first_page):
        return "quidos_epr"

    return None
diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
new file mode 100644
index 00000000..2e772886
--- /dev/null
+++ b/survey_report/extraction/quidos.py
@@ -0,0 +1,256 @@
+import re
+
+
class SiteNotesExtractor:
    """
    Extracts SAP rating, carbon emissions, energy costs, building dimensions and wall details
    from the text of an EPC summary (site notes) report.
    """

    def __init__(self, pdf_text):
        """
        Initializes the SiteNotesExtractor with the extracted PDF text.
        """
        self.text = pdf_text
        self.data = {}

    def extract_sap_rating(self):
        """
        Extracts the current and potential SAP rating from the report.

        :raises ValueError: If the rating line cannot be found.
        """
        pattern = re.search(r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text)

        if not pattern:
            raise ValueError("No SAP rating found in the report")

        self.data.update({
            "Current EPC Band": pattern.group(1),
            "Current SAP Rating": int(pattern.group(2)),
            "Potential EPC Band": pattern.group(3),
            "Potential SAP Rating": int(pattern.group(4)),
        })

    def extract_carbon_emissions(self):
        """
        Extracts the current annual carbon emissions (TCO2).

        :raises ValueError: If the emissions line cannot be found.
        """
        pattern = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)

        if not pattern:
            raise ValueError("No carbon emissions found in the report")

        self.data.update({
            "Current Carbon Emissions (TCO2)": float(pattern.group(1)),
        })

    def extract_building_dimensions(self):
        """
        Extracts dimensions for each building part and stores them in a list.
        Handles Main Property and multiple extensions.

        Stores the per-part rows under "Building Dimensions" and the summed floor area and
        heat loss area under "Total Building Dimensions".

        :raises ValueError: If the dimensions section or its rows cannot be found.
        """

        # Locate the Dimensions section
        dimensions_section = re.search(
            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
            r"Party Wall "
            r"Length \(m\)\n"
            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
        )

        if not dimensions_section:
            raise ValueError("Failed to locate the dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
        building_part_pattern = re.compile(
            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )

        building_parts = []
        for match in building_part_pattern.finditer(dimensions_text):
            to_append = {
                "Building Part": match.group(1).strip(),
                "Part Floor Area (m2)": float(match.group(2)),
                "Room Height (m)": float(match.group(3)),
                "Loss Perimeter (m)": float(match.group(4)),
                "Party Wall Length (m)": float(match.group(5)),
            }
            # We calculate the heat loss area
            to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"]
            building_parts.append(to_append)

        if not building_parts:
            raise ValueError("No building dimensions found in the report")

        self.data["Building Dimensions"] = building_parts
        # We calculate some totals
        self.data["Total Building Dimensions"] = {
            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
        }

    def extract_bills_estimate(self):
        """
        Extracts the estimated annual energy costs (£) from the report.

        :raises ValueError: If the energy cost line cannot be found.
        """
        pattern = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)

        if not pattern:
            raise ValueError("No bills estimate found in the report")

        self.data["Estimated Annual Energy Cost (£)"] = float(pattern.group(1).replace(",", ""))

    def extract_all(self):
        """
        Runs all extraction methods and returns a dictionary with extracted data.

        Wall details are not part of this yet; call extract_walls separately.
        """
        self.extract_sap_rating()
        self.extract_carbon_emissions()
        self.extract_bills_estimate()
        self.extract_building_dimensions()

        # Extract specific measures
        # Primary wall
        # Secondary wall
        # Roof
        # Floor
        # Heating system
        # Hot water system
        # Windows
        # Doors
        # Lighting
        # Ventilation
        # Solar

        return self.data

    def extract_walls(self):
        """
        Extracts wall type, insulation and thickness for each building part within the
        7.0 Walls section of the summary PDF text.

        :return: One dict per building part. Text fields that are not present are reported
            as "Unknown" and a missing wall thickness as None. The "Alternative ..." keys
            are always None for now (see TODO below).
        :raises ValueError: If the walls section cannot be found.
        """
        # Isolate the 7.0 Walls section
        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", self.text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate the walls section in the text.")

        # The section regex consumes the newline in front of "8.0 Roofs"; put it back so the
        # last field line is newline-terminated like the others and is not dropped.
        wall_section = wall_section_match.group(1) + "\n"

        # Each building part is a heading line followed by optional field lines. The named
        # groups are what the loop below reads.
        wall_pattern = re.compile(
            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
            re.MULTILINE
        )

        # TODO: We aren't effectively picking up alternative walls ("Alternative Wall
        # Sheltered" blocks); the Alternative keys below stay None until that is handled.

        wall_data = []
        for match in wall_pattern.finditer(wall_section):
            building_part = match.group("section")
            # "Main Property Alternative" is reported under the main property.
            building_part = "Main Property" if "Main Property" in building_part else building_part
            thickness = match.group("thickness")

            wall_data.append({
                "Building Part": building_part,
                "Wall Type": match.group("construction") or "Unknown",
                "Wall Insulation": match.group("insulation") or "Unknown",
                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
                "Wall Thickness (mm)": int(thickness) if thickness else None,
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Insulation Thickness (mm)": None,
                "Alternative Wall Thickness Measured": None,
                "Alternative Wall Thickness (mm)": None,
            })

        return wall_data
+
+
class EPRExtractor:
    """
    Pulls the address and the space/water heating consumption figures out of the text of
    an Energy Performance Report (EPR).
    """

    def __init__(self, pdf_text):
        """
        Keeps the report text and starts with an empty result dictionary.
        """
        self.data = {}
        self.text = pdf_text

    def extract_heating_consumption(self):
        """
        Reads the space heating and water heating figures (KWH) into ``self.data``.

        :raises ValueError: If the two figures cannot be located.
        """
        found = re.search(
            r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)",
            self.text,
            re.DOTALL
        )
        if found is None:
            raise ValueError("No heating data found in the report")

        # Figures are printed with thousands separators, e.g. "12,345".
        space_kwh, water_kwh = [int(figure.replace(",", "")) for figure in found.groups()]
        self.data["Space Heating (KWH)"] = space_kwh
        self.data["Water Heating (KWH)"] = water_kwh

    def extract_address(self):
        """
        Reads the text between the "Address" and "Town" labels into ``self.data["Address"]``.

        :raises ValueError: If the address block cannot be located.
        """
        found = re.search(
            r"Address\s*(.*?)\nTown\s*(.*?)\n",
            self.text,
            re.DOTALL
        )
        if found is None:
            raise ValueError("No address found in the report")

        self.data["Address"] = found.group(1).strip()

    def extract_all(self):
        """
        Runs the address and heating extraction and returns the collected dictionary.
        """
        for step in (self.extract_address, self.extract_heating_consumption):
            step()
        return self.data
diff --git a/survey_report/requirements.txt b/survey_report/requirements.txt
new file mode 100644
index 00000000..e69de29b
diff --git a/survey_report/template.html b/survey_report/template.html
new file mode 100644
index 00000000..5d3b6c63
--- /dev/null
+++ b/survey_report/template.html
@@ -0,0 +1,123 @@
+
+
+
+
+
+ Domna Energy Report
+
+
+
+
+
+
+
+
+
+
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+
+
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+
+
+
+
+
diff --git a/utils/OsmosisCondtionReportParser.py b/utils/OsmosisCondtionReportParser.py
new file mode 100644
index 00000000..4d8873a2
--- /dev/null
+++ b/utils/OsmosisCondtionReportParser.py
@@ -0,0 +1,49 @@
+import re
+import boto3
+import PyPDF2
+import fitz
+
+
class OsmosisConditionReportParser:
    """
    Reads an Osmosis condition report PDF and extracts selected fields from its text.
    """

    def __init__(self, filekey, bucket_name=None):
        """
        :param filekey: Path of the PDF (local path when no bucket is given).
        :param bucket_name: S3 bucket holding the file. Reading from S3 is not implemented
            yet, so passing a bucket raises NotImplementedError.
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.pdf_text = None

        self._read_file()

    def _read_file(self):
        """
        Reads the PDF and stores the concatenated text of all pages in ``self.pdf_text``.

        Raises:
            NotImplementedError: If a bucket name was given (S3 is not supported yet).
            ValueError: If the file cannot be found, read, or parsed.
        """
        if self.bucket_name:
            # Raised outside the try block so it is not re-wrapped as a ValueError below.
            raise NotImplementedError("Reading condition reports from S3 is not implemented")

        try:
            with fitz.open(self.filekey) as pdf:
                self.pdf_text = "".join(page.get_text() for page in pdf)
        except FileNotFoundError as err:
            raise ValueError(f"Local file not found: {self.filekey}") from err
        except Exception as e:
            raise ValueError(f"An error occurred while reading or parsing the PDF: {e}") from e

    def _search_field(self, pattern, field_name):
        """
        Returns group 1 of ``pattern`` in the report text.

        Raises:
            ValueError: If the pattern does not match.
        """
        match = re.search(pattern, self.pdf_text)
        if match is None:
            raise ValueError(f"Could not find '{field_name}' in the condition report")
        return match.group(1)

    def extract(self):
        """
        Extracts the number of bedrooms and the risk assessment pathway.

        Returns:
            dict: "No. of Bedrooms" (int) and "Risk Assessment Pathway" (single capital letter).

        Raises:
            ValueError: If either field is missing from the report text.
        """
        return {
            "No. of Bedrooms": int(
                self._search_field(r"No\. of Bedrooms \(Total\)\s*(\d+)", "No. of Bedrooms")
            ),
            "Risk Assessment Pathway": self._search_field(
                r"Risk\s*Assessment\s*Pathway\s*([A-Z])", "Risk Assessment Pathway"
            ),
        }
diff --git a/utils/file_data_extraction.py b/utils/file_data_extraction.py
new file mode 100644
index 00000000..2e849ef5
--- /dev/null
+++ b/utils/file_data_extraction.py
@@ -0,0 +1,1150 @@
+import PyPDF2
+import re
+import pdfplumber
+from collections import Counter
+from utils.logger import setup_logger
+from xml.dom.minidom import parseString
+from pdf2image import convert_from_path
+from pytesseract import image_to_string
+
+logger = setup_logger()
+
+"""
+This script contains functions used to extract data from retrofit survey files, including EPRs,
+summary reports, etc
+"""
+
+
def is_elmhurst_energy_report(text):
    """
    Tells whether the text belongs to an Elmhurst Energy Report.
    Returns True if the text starts with 'ENERGY REPORT' (case-sensitive).
    """
    heading = "ENERGY REPORT"
    return text.startswith(heading)
+
+
def is_elmhurst_summary_report(text):
    """
    Tells whether the text belongs to a Summary Report, i.e. starts with "Summary Information".
    """
    heading = "Summary Information"
    return text.startswith(heading)
+
+
def is_osmosis_condition_report(text):
    """
    Tells whether the text belongs to an Osmosis Condition Report, i.e. starts with one of
    the two known report headings.
    """
    headings = ("OsmosisACDNEWPAS2035ConditionReport", "OsmosisACDPAS2035ConditionReport")
    return text.startswith(headings)
+
+
def is_elmhurst_evidence_report(text):
    """
    Tells whether the text belongs to an Elmhurst Evidence Report, i.e. starts with
    "RdSAP Evidence Report".
    """
    heading = "RdSAP Evidence Report"
    return text.startswith(heading)
+
+
def is_pulse_air_permeability(text):
    """
    Tells whether the text belongs to a Pulse Air Permeability Report, i.e. starts with
    the report's heading line.
    """
    heading = "Air Permeability Test Report @O PULSE"
    return text.startswith(heading)
+
+
def is_elmhurst_project_handover(text):
    """
    Tells whether the text belongs to an Elmhurst Project Handover Report, i.e. contains
    the handover title anywhere, written with underscores or with spaces.
    """
    titles = ("Retrofit_Project_Handover", "Retrofit Project Handover")
    return any(title in text for title in titles)
+
+
def is_core_logic_pas_assessment_report(text):
    """
    Tells whether the text belongs to a CoreLogic PAS Assessment Report, i.e. starts with
    "Generated Using CoreLogic UK PAS Assessment".
    """
    heading = "Generated Using CoreLogic UK PAS Assessment"
    return text.startswith(heading)
+
+
def detect_pdf_report_type(pdf_path):
    """
    Works out which kind of report a PDF is from the text of its first page.

    When the first page has no extractable text, the page is rendered to an image and
    read with OCR instead.

    :param pdf_path: String path to the PDF file
    :return: One of "elmhurst epr", "elmhurst summary report", "osmosis condition report",
        "elmhurst evidence report", "pulse air permeability", "elmhurst project handover",
        "core logic pas assessment report", or None when nothing matches
    """
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        if reader.pages:
            first_page_text = reader.pages[0].extract_text()
        else:
            first_page_text = ""

    if first_page_text == "":
        # No text layer: fall back to OCR on the rendered first page.
        logger.info("Extracting text from PDF images..., this may take a moment.")
        pages = convert_from_path(pdf_path, dpi=300)
        if pages:
            first_page_text = image_to_string(pages[0])

    # Checked in order; the first detector that recognises the page wins.
    detectors = (
        (is_elmhurst_energy_report, "elmhurst epr"),
        (is_elmhurst_summary_report, "elmhurst summary report"),
        (is_osmosis_condition_report, "osmosis condition report"),
        (is_elmhurst_evidence_report, "elmhurst evidence report"),
        (is_pulse_air_permeability, "pulse air permeability"),
        (is_elmhurst_project_handover, "elmhurst project handover"),
        (is_core_logic_pas_assessment_report, "core logic pas assessment report"),
    )
    for recognises, report_type in detectors:
        if recognises(first_page_text):
            return report_type

    return None
+
+
def detect_xml_report_type(xml_path):
    """
    Detects the type of XML report from the content of its <Product> element.

    :param xml_path: String path to the XML file
    :return: "full sap xml" when the first <Product> element reads "Sap 2012 Desktop"
    :raises NotImplementedError: For any other XML document (no other report type is
        supported yet). This is an ``Exception`` subclass, so existing handlers still work.
    """
    # Read bytes rather than text so the parser applies the encoding declared in the
    # document instead of the platform default.
    with open(xml_path, "rb") as file:
        contents = file.read()

    document = parseString(contents)
    product_tag_search = document.getElementsByTagName("Product")
    if product_tag_search:
        # An empty <Product/> has no child node to read a value from.
        product_node = product_tag_search[0].firstChild
        if product_node is not None and product_node.nodeValue == "Sap 2012 Desktop":
            return "full sap xml"

    raise NotImplementedError("Not implemented")
+
+
def is_pdf(filename):
    """
    Determines if the provided filename is a PDF file.

    The extension is compared case-insensitively, so "REPORT.PDF" counts as a PDF.
    """
    return filename.lower().endswith(".pdf")
+
+
def is_xml(filename):
    """
    Determines if the provided filename is an XML file.

    The extension is compared case-insensitively, so "EXPORT.XML" counts as XML.
    """
    return filename.lower().endswith(".xml")
+
+
+class ElmhurstEprExtractor:
+ """
+ A utility class for extracting specific data from Elmhurst Energy Performance Reports (EPR).
+ """
+
    def __init__(self, file_path):
        """
        Stores the path of the EPR file on the instance.

        :param file_path: Path of the EPR file this extractor is associated with.
        """
        self.file_path = file_path
+
+ @staticmethod
+ def extract_window_age_description(windows_text):
+ """
+ Extracts the most common window age description and its proportion.
+ """
+ windows_text = windows_text.replace("\n", "")
+ window_descriptions = [
+ "Double post or during 2002",
+ "Double pre 2002",
+ "Double with unknown install date",
+ "Secondary glazing",
+ "Triple glazing",
+ "Single glazing",
+ ]
+ description_counts = Counter()
+ for description in window_descriptions:
+ matches = re.findall(re.escape(description), windows_text)
+ description_counts[description] = len(matches)
+
+ if not description_counts or not sum(description_counts.values()):
+ raise ValueError("Failed to extract window data.")
+
+ most_common_description, window_count = description_counts.most_common(1)[0]
+ window_proportion = window_count / sum(description_counts.values()) * 100
+
+ if window_proportion == 100:
+ second_most_common_description = None
+ second_most_common_proportion = 0
+ else:
+ second_most_common_description, second_window_count = description_counts.most_common(2)[1]
+ second_most_common_proportion = second_window_count / sum(description_counts.values()) * 100
+
+ return {
+ "Window Age Description": most_common_description,
+ "Window Age Description Proportion (%)": window_proportion,
+ "Secondary Window Age Description": second_most_common_description,
+ "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
+ "Number of Windows": sum(description_counts.values())
+ }
+
    @staticmethod
    def extract_building_parts(text):
        """
        Extracts building parts and associated dimensions from the provided text.

        Each "Construction details: Building part: ..." section yields one row per floor
        line ("Lowest floor", "First floor", "Second floor") found in its dimensions table,
        plus a "Room in Roof" row when the part heading carries a
        "Room(s) in Roof area: <number>" entry.

        :param text: Text of the report to scan.
        :return: List of dicts with the keys "Building Part", "Floor Level",
            "Floor Area (m2)", "Room Height (m)", "Perimeter (m)" and
            "Party Wall Length (m)". Room-in-roof rows only carry a floor area; their
            other dimensions are None.
        """
        data = []
        # Group 1 is the part heading: everything up to the dimensions table header, so it
        # can span several lines. Group 2 is the table body, which runs until the next
        # "Construction details" / "Data inputs" marker or the end of the text.
        building_part_pattern = re.compile(
            r"Construction details: Building part: (.*?)\nFloor Area \[m2\] Room Height \[m\] Perimeter \[m\] Party "
            r"Wall Length \[m\]\n(.*?)(?=Construction details|Data inputs|$)",
            re.DOTALL
        )
        for match in building_part_pattern.finditer(text):
            part_name = match.group(1).strip()
            floor_data = match.group(2)
            room_in_roof_match = re.search(r"Room\(s\) in Roof area:\s*([\d.]+)", part_name)
            if room_in_roof_match:
                floor_area = float(room_in_roof_match.group(1))
                # Drop the " - built in ..." suffix and the room-in-roof entry from the name.
                cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
                data.append({
                    "Building Part": cleaned_part_name,
                    "Floor Level": "Room in Roof",
                    "Floor Area (m2)": floor_area,
                    "Room Height (m)": None,
                    "Perimeter (m)": None,
                    "Party Wall Length (m)": None
                })
            else:
                cleaned_part_name = re.sub(r" - built in.*", "", part_name).strip()

            # One table row per floor: level name followed by four numeric columns.
            floor_pattern = re.compile(
                r"(Lowest floor|First floor|Second floor)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
            )
            for floor_match in floor_pattern.finditer(floor_data):
                floor_level = floor_match.group(1)
                floor_area = float(floor_match.group(2))
                room_height = float(floor_match.group(3))
                perimeter = float(floor_match.group(4))
                party_wall_length = float(floor_match.group(5))
                data.append({
                    "Building Part": cleaned_part_name,
                    "Floor Level": floor_level,
                    "Floor Area (m2)": floor_area,
                    "Room Height (m)": room_height,
                    "Perimeter (m)": perimeter,
                    "Party Wall Length (m)": party_wall_length
                })

        return data
+
+ @staticmethod
+ def extract_roof_details(text):
+ """
+ Extracts roof details for each building part in the provided text.
+ """
+ roof_data = []
+ building_part_pattern = re.compile(
+ r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
+ re.DOTALL
+ )
+ for match in building_part_pattern.finditer(text):
+ part_name = match.group(1).strip()
+ cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
+ part_details = match.group(2)
+ roof_type_match = re.search(r"Roof Type:\s*(.*?)(?=\n|$)", part_details)
+ roof_insulation_match = re.search(r"Roof Insulation:\s*(.*?)(?=\n|$)", part_details)
+ roof_insulation_thickness_match = re.search(r"Roof Insulation Thickness:\s*(.*?)(?=\n|$)", part_details)
+
+ roof_data.append({
+ "Building Part": cleaned_part_name,
+ "Roof Type": roof_type_match.group(1).strip() if roof_type_match else None,
+ "Roof Insulation": roof_insulation_match.group(1).strip() if roof_insulation_match else None,
+ "Roof Insulation Thickness": roof_insulation_thickness_match.group(
+ 1).strip() if roof_insulation_thickness_match else None,
+ })
+
+ return roof_data
+
+ @staticmethod
+ def extract_wall_details(text):
+ """
+ Extracts wall details for each building part in the provided text.
+ """
+ wall_data = []
+ building_part_pattern = re.compile(
+ r"Construction details: Building part: (.*?)\n(.*?)(?=Conservatory|Construction details|$)",
+ re.DOTALL
+ )
+ for match in building_part_pattern.finditer(text):
+ part_name = match.group(1).strip()
+ cleaned_part_name = re.sub(r" - built in.*|Room\(s\) in Roof area:.*", "", part_name).strip()
+ part_details = match.group(2)
+ wall_type_match = re.search(r"Wall Type:\s*(.*?)(?=\n|$)", part_details)
+ wall_insulation_match = re.search(r"Wall Insulation:\s*(.*?)(?=\n|$)", part_details)
+ wall_drylining_match = re.search(r"Wall Dry-lining:\s*(.*?)(?=\n|$)", part_details)
+ wall_thickness_match = re.search(r"Wall Thickness:\s*(\d+)(?=\n|$)", part_details)
+
+ wall_data.append({
+ "Building Part": cleaned_part_name,
+ "Wall Type": wall_type_match.group(1).strip() if wall_type_match else None,
+ "Wall Insulation": wall_insulation_match.group(1).strip() if wall_insulation_match else None,
+ "Wall Dry-lining": wall_drylining_match.group(1).strip() if wall_drylining_match else None,
+ "Wall Thickness": int(wall_thickness_match.group(1)) if wall_thickness_match else None,
+ })
+
+ return wall_data
+
+ @staticmethod
+ def extract_conservatory(text):
+ """
+ Extracts conservatory data from the provided text.
+ The section is located between "Conservatory" and "Doors".
+
+ Args:
+ text (str): The full text of the EPR PDF.
+
+ Returns:
+ dict: A dictionary with conservatory details:
+ - "Conservatory Present"
+ - "Conservatory Separated"
+ - "Conservatory Floor Area"
+ - "Conservatory Double Glazed"
+ - "Conservatory Glazed Perimeter"
+ - "Heated Conservatory Height"
+ """
+
+ conservatory_match = re.search(r"Conservatory\s*(.*?)\s*Doors", text, re.DOTALL)
+ if not conservatory_match:
+ logger.error("Failed to extract conservatory data.")
+ raise ValueError("Could not extract conservatory data.")
+
+ conservatory_text = conservatory_match.group(1)
+
+ # Check if conservatory is present
+ present_match = re.search(r"Conservatory Present:\s*(Yes|No)", conservatory_text)
+
+ if not present_match or present_match.group(1).strip() == "No":
+ logger.info("Conservatory not present.")
+ return {
+ "Conservatory Present": "No",
+ "Conservatory Separated": "",
+ "Conservatory Floor Area": 0,
+ "Conservatory Double Glazed": "",
+ "Conservatory Glazed Perimeter": 0,
+ "Heated Conservatory Height": "",
+ }
+
+ # Extract conservatory details
+ separated_match = re.search(r"Conservatory Separated:\s*(Yes|No)", conservatory_text)
+ floor_area_match = re.search(r"Conservatory Floor Area:\s*([\d.]+)", conservatory_text)
+ double_glazed_match = re.search(r"Conservatory Double Glazed:\s*(Yes|No)", conservatory_text)
+ glazed_perimeter_match = re.search(r"Conservatory Glazed Perimeter:\s*([\d.]+)", conservatory_text)
+ height_match = re.search(r"Heated Conservatory Height:\s*(.*?)(?=\n|$)", conservatory_text)
+
+ return {
+ "Conservatory Present": "Yes",
+ "Conservatory Separated": separated_match.group(1).strip() if separated_match else "",
+ "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0,
+ "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "",
+ "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0,
+ "Heated Conservatory Height": height_match.group(1).strip() if height_match else "",
+ }
+
+ @staticmethod
+ def _extract_heating_details(section_text, default_value=""):
+ """
+ Extracts heating details from a given section of text.
+
+ Args:
+ section_text (str): The section of text containing heating details.
+ default_value (str, optional): The default value to return for missing fields. Defaults to "".
+
+ Returns:
+ dict: A dictionary containing heating system details.
+ """
+ system_search = re.search(r"Main Heating Code\s*(.*?)\n", section_text)
+ pcdf_search = re.search(r"PCDF boiler Reference\s*(\d+)", section_text)
+ controls_search = re.search(r"Main Heating Controls\s*(.*?)\n", section_text)
+ heat_search = re.search(r"Percentage of Heat\s*(\d+)\s*%?", section_text)
+
+ return {
+ "System": system_search.group(1).strip() if system_search else default_value,
+ "PCDF Reference": pcdf_search.group(1) if pcdf_search else default_value,
+ "Controls": controls_search.group(1).strip() if controls_search else default_value,
+ "% of Heat": int(heat_search.group(1)) if heat_search else 0,
+ }
+
+ def extract_primary_heating(self, text):
+
+ # Extract Primary Heating Section (Main Heating 1)
+ primary_heating_section1 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Main\s*Heating\s*2", text, re.DOTALL)
+ # We may not have a secondary heating
+ primary_heating_section2 = re.search(r"Main\s*Heating\s*1\s*(.*?)\s*Secondary\s*Heating", text, re.DOTALL)
+ primary_heating_section = primary_heating_section1 if primary_heating_section1 else primary_heating_section2
+ primary_text = primary_heating_section.group(1)
+
+ return self._extract_heating_details(primary_text)
+
+ def extract_secondary_heating_details(self, text):
+ # Extract Secondary Heating Section (Main Heating 2)
+ secondary_heating_section = re.search(r"Main\s*Heating\s*2\s*(.*?)\s*Secondary Heating", text, re.DOTALL)
+
+ output = {}
+ if secondary_heating_section is None:
+
+ output["System"] = ""
+ output["PCDF Reference"] = ""
+ output["Controls"] = ""
+ output["% of Heat"] = 0
+
+ else:
+ secondary_text = secondary_heating_section.group(1)
+ output.update(
+ **self._extract_heating_details(secondary_text)
+ )
+
+ output["Heating Code"] = (
+ re.search(r"Secondary Heating Code\s*(.*?)\n", text).group(1).strip()
+ if output["System"] and re.search(r"Secondary Heating Code\s*(.*?)\n", text)
+ else ""
+ )
+
+ return output
+
+ def extract(self):
+ """
+ Extracts all relevant data from the EPR PDF.
+
+ Returns:
+ dict: A dictionary containing extracted data, including:
+ - Address and Postcode
+ - SAP Rating and Primary Energy Use
+ - Lighting, Doors, Windows, Roof, and Wall Details
+ - Heating systems (Primary and Secondary)
+ - Building Parts
+ """
+ data = {}
+
+ with open(self.file_path, "rb") as file:
+ reader = PyPDF2.PdfReader(file)
+ text = "".join(page.extract_text() for page in reader.pages)
+
+ data["Assessor Name"] = re.search(r"Created by:\s*(.*?)\n", text).group(1).strip()
+ data["Assessment Date"] = re.search(r"\nAssessment Date\s*(.*?)\n", text).group(1).strip()
+
+ # Extracting individual components
+ address_match = re.search(r"ENERGY REPORT\nDwelling Address\s*(.*?)\s*\nReference", text, re.DOTALL)
+ if not address_match:
+ logger.error("Failed to extract address.")
+ raise ValueError("Failed to extract address.")
+ data["Address"] = address_match.group(1).strip()
+ data["Postcode"] = data["Address"].split(",")[-1].strip()
+
+ # TODO:
+ data["Region"] = None
+ data["House Name"] = None
+ data["House No"] = None
+ data["Street"] = None
+ data["Locality"] = None
+ data["Town"] = None
+ data["County"] = None
+
+ sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
+ if not sap_match:
+ logger.error("Failed to extract SAP rating.")
+ raise ValueError("Failed to extract SAP rating.")
+ data["Current SAP Rating"] = int(sap_match.group(1))
+
+ energy_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
+ if not energy_match:
+ logger.error("Failed to extract primary energy use.")
+ raise ValueError("Failed to extract primary energy use.")
+ data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(energy_match.group(1))
+
+ storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
+ if not storeys_match:
+ logger.error("Failed to extract the number of storeys.")
+ raise ValueError("Failed to extract the number of storeys.")
+ data["Number of Storeys"] = int(storeys_match.group(1))
+
+ fuel_match = re.search(r"TOTAL\s*£(\d+)", text)
+ if not fuel_match:
+ logger.error("Failed to extract fuel bill.")
+ raise ValueError("Failed to extract fuel bill.")
+ data["Fuel Bill"] = f"£{fuel_match.group(1)}"
+
+ total_doors_match = re.search(r"Total Doors:\s*(\d+)", text)
+ if not total_doors_match:
+ logger.error("Failed to extract total doors.")
+ raise ValueError("Failed to extract total doors.")
+ data["Total Number of Doors"] = int(total_doors_match.group(1))
+
+ # Extract Number of Insulated Doors
+ insulated_doors_match = re.search(r"Insulated Doors:\s*(\d+)", text)
+ if not insulated_doors_match:
+ logger.error("Failed to extract insulated doors.")
+ raise ValueError("Failed to extract insulated doors.")
+ data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))
+
+ # Get number of lighting outlets and number of fittings needing LEL
+ lighting_fittings_match = re.search(r"Total number of light fittings\s*(\d+)", text)
+ if not lighting_fittings_match:
+ logger.error("Failed to extract lighting.")
+ raise ValueError("Failed to extract lighting")
+ data["Number of Light Fittings"] = int(lighting_fittings_match.group(1))
+ lel_fittings_match = re.search(r"Total number of L.E.L. fittings\s*(\d+)", text)
+ if not lel_fittings_match:
+ logger.error("Failed to extract LEL fittings.")
+ raise ValueError("Failed to extract LEL fittings.")
+ data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
+ data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]
+
+ windows_section = re.search(r"Windows\s*(.*?)\s*Draught Proofing", text, re.DOTALL)
+ if not windows_section:
+ logger.error("Failed to extract window data.")
+ raise ValueError("Failed to extract window data.")
+ data["Windows"] = self.extract_window_age_description(windows_section.group(1))
+
+ data["Primary Heating"] = self.extract_primary_heating(text)
+ data["Secondary Heating"] = self.extract_secondary_heating_details(text)
+ data["Building Parts"] = self.extract_building_parts(text)
+ data["Roof Details"] = self.extract_roof_details(text)
+ data["Wall Details"] = self.extract_wall_details(text)
+ data["Conservatory"] = self.extract_conservatory(text)
+
+ water_heating_code_match = re.search(r"Water Heating Code\s*(.*?)\n", text)
+ if not water_heating_code_match:
+ logger.error("Failed to extract water heating code.")
+ raise ValueError("Failed to extract water heating code.")
+ data["Water Heating Code"] = water_heating_code_match.group(1).strip()
+
+ return data
+
+
class ElmhurstSummaryReportExtractor:
    """
    A utility class for extracting specific data from Elmhurst Summary Report PDFs.

    The ``extract_*`` static methods each parse one part of the report text;
    ``extract`` reads the PDF at ``file_path`` and combines their results.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    @staticmethod
    def _search_group(pattern, text, default=""):
        """Return the stripped first capture group of ``pattern`` in ``text``, or ``default``."""
        match = re.search(pattern, text)
        return match.group(1).strip() if match else default

    @staticmethod
    def extract_window_age_description(windows_text):
        """
        Extracts the most common window age description and its proportion.

        Parameters:
            windows_text (str): The text section containing window data.

        Returns:
            dict: The most common and second most common window age
            descriptions, their proportions (in %), and the number of windows.

        Raises:
            ValueError: If none of the known descriptions occurs in the text.
        """
        # Clean up windows_text by removing line breaks for better pattern matching
        windows_text = windows_text.replace("\n", "")

        # Define possible window age descriptions
        window_descriptions = [
            "Double post or during 2002",
            "Double pre 2002",
            "Double with unknown install date",
            "Secondary glazing",
            "Triple glazing",
            "Single glazing",
        ]

        # Count (non-overlapping) occurrences of each description
        description_counts = Counter(
            {description: windows_text.count(description) for description in window_descriptions}
        )
        total_windows = sum(description_counts.values())
        if not total_windows:
            raise ValueError("Failed to extract window data.")

        # Determine the most common description and calculate its proportion
        ranked = description_counts.most_common(2)
        most_common_description, window_count = ranked[0]
        window_proportion = window_count / total_windows * 100

        # Get the second most common and the proportion
        if window_proportion == 100:
            second_most_common_description = None
            second_most_common_proportion = 0
        else:
            second_most_common_description, second_window_count = ranked[1]
            second_most_common_proportion = second_window_count / total_windows * 100

        return {
            "Window Age Description": most_common_description,
            "Window Age Description Proportion (%)": window_proportion,
            "Secondary Window Age Description": second_most_common_description,
            "Secondary Window Age Description Proportion (%)": second_most_common_proportion,
            "Number of Windows": total_windows,
        }

    @staticmethod
    def extract_primary_heating(text):
        """
        Extracts the primary heating system ("Main Heating1") details.

        The section ends at "Main Heating2" or, failing that, at "Water Heating".
        Fields that are absent from the section (for example the PCDF boiler
        reference) default to "" and "% of Heat" defaults to 0, instead of
        failing with an AttributeError.

        Raises:
            ValueError: If the primary heating section cannot be located.
        """
        primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Main\s*Heating2", text, re.DOTALL)
        if primary_heating_section is None:
            primary_heating_section = re.search(r"Main\s*Heating1\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
        if primary_heating_section is None:
            raise ValueError("Failed to extract primary heating data.")

        primary_text = primary_heating_section.group(1)
        search = ElmhurstSummaryReportExtractor._search_group

        return {
            "System": search(r"Main Heating Code\s*(.*?)\n", primary_text),
            "PCDF Reference": search(r"PCDF boiler Reference\s*(\d+)", primary_text),
            "Controls": search(r"Main Heating Controls\s*(.*?)\n", primary_text),
            "% of Heat": int(search(r"Percentage of Heat\s*(\d+)\s*%", primary_text, default="0")),
        }

    @staticmethod
    def extract_secondary_heating_details(text):
        """
        Extracts the second main heating system ("Main Heating2") details.

        Returns:
            dict: Keys "System", "PCDF Reference", "Controls", "% of Heat" and
            "Heating Code". When a "Main Heating2" section exists the dict
            additionally carries "Heating Controls" (same value as "Controls").
        """
        # Defaults
        output = {
            "System": "",
            "PCDF Reference": "",
            "Controls": "",
            "% of Heat": 0,
            "Heating Code": "",
        }

        secondary_heating_section = re.search(r"Main\s*Heating2\s*(.*?)\s*Water\s*Heating", text, re.DOTALL)
        if secondary_heating_section is None:
            return output

        # Overwrite defaults
        secondary_text = secondary_heating_section.group(1)
        search = ElmhurstSummaryReportExtractor._search_group

        output["System"] = search(
            r"Main Heating Code\s*(.*?)(?=\n|Percentage of Heat)", secondary_text
        )
        output["PCDF Reference"] = search(r"PCDF boiler Reference\s*(\d+)", secondary_text)

        controls = search(r"Main Heating Controls\s*(.*?)\n", secondary_text)
        # The controls used to be stored only under "Heating Controls", leaving the
        # declared "Controls" key empty. "Controls" is now filled; the old key is
        # kept so existing readers of "Heating Controls" keep working.
        output["Controls"] = controls
        output["Heating Controls"] = controls

        output["% of Heat"] = int(search(r"Percentage of Heat\s*(\d+)\s*%", secondary_text, default="0"))

        # The secondary heating code is only reported when a second system exists
        if output["System"] != "":
            output["Heating Code"] = search(r"Secondary Heating Code\s*(.*?)\n", text)

        return output

    @staticmethod
    def extract_building_parts(text):
        """
        Extracts building parts and associated dimensions from the summary report PDF.
        This includes Main Property, multiple extensions if they exist, and Room in Roof areas.

        Returns:
            list[dict]: One entry per floor of each building part.

        Raises:
            ValueError: If the dimensions section cannot be located.
        """
        # Locate the Dimensions section
        dimensions_section = re.search(
            r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
        )
        if not dimensions_section:
            raise ValueError("Failed to locate dimensions section in the text.")

        dimensions_text = dimensions_section.group(1)

        # Pattern to extract each building part, starting from Main Property and including extensions
        building_part_pattern = re.compile(
            r"(Main Property|\d+(?:st|nd|rd|th) Extension)\s*"
            r"(.*?)(?=\d+(?:st|nd|rd|th) Extension|5\.0 Conservatory|$)",
            re.DOTALL
        )
        # Floor details: Floor Level, Floor Area, Room Height, Perimeter, Party Wall Length
        floor_pattern = re.compile(
            r"(1st Floor|Lowest Floor|Second floor):\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
        )
        # "Room(s) in Roof" entries only have a Floor Area
        room_in_roof_pattern = re.compile(r"Room\(s\) in Roof:\s*([\d.]+)")

        data = []
        for match in building_part_pattern.finditer(dimensions_text):
            part_name = match.group(1)
            floor_data = match.group(2)

            # Extract data for each floor within the building part
            for floor_match in floor_pattern.finditer(floor_data):
                data.append(
                    {
                        "Building Part": part_name,
                        "Floor Level": floor_match.group(1),
                        "Floor Area (m2)": float(floor_match.group(2)),
                        "Room Height (m)": float(floor_match.group(3)),
                        "Perimeter (m)": float(floor_match.group(4)),
                        "Party Wall Length (m)": float(floor_match.group(5)),
                    }
                )

            room_in_roof_match = room_in_roof_pattern.search(floor_data)
            if room_in_roof_match:
                data.append(
                    {
                        "Building Part": part_name,
                        "Floor Level": "Room in Roof",
                        "Floor Area (m2)": float(room_in_roof_match.group(1)),
                        "Room Height (m)": None,  # Placeholder for missing data
                        "Perimeter (m)": None,  # Placeholder for missing data
                        "Party Wall Length (m)": None,  # Placeholder for missing data
                    }
                )

        return data

    @staticmethod
    def extract_roof_details(text):
        """
        Extracts roof type, insulation, and insulation thickness for each building part
        in the 8.0 Roofs section of the summary report.

        Returns:
            list[dict]: One entry per building part; empty when there is no roof section.
        """
        roof_data = []

        # Locate the entire 8.0 Roofs section
        roof_section_match = re.search(r"8\.0 Roofs:\n(.*?)(?=\n9\.0 Floors:|$)", text, re.DOTALL)
        if not roof_section_match:
            return roof_data  # Return empty if no roof section is found

        # Extract the roof section and append "9.0 Floors:" as the boundary
        roof_section = roof_section_match.group(1).strip() + "\n9.0 Floors:"

        # Define pattern to match each building part's roof entry
        building_part_pattern = re.compile(
            r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
            r"Type\s+(.*?)(?=\n(?:Insulation|9\.0 Floors:|[A-Z]))"  # Roof Type until the next field, label or end
            r"(?:\nInsulation\s+(.*?)(?=\n(?:Insulation Thickness|9\.0 Floors:|[A-Z])))?"  # Optional Insulation
            r"(?:\nInsulation Thickness\s+(.*?)(?=\n(?:9\.0 Floors:|[A-Z])))?",  # Optional Insulation Thickness
            re.DOTALL
        )

        # Extract each building part's data
        for match in building_part_pattern.finditer(roof_section):
            part_name = match.group(1).strip()  # Building part label
            roof_type = match.group(2).strip()  # Roof Type
            roof_insulation = match.group(3).strip() if match.group(3) else None  # Optional Insulation
            roof_insulation_thickness = match.group(4).strip() if match.group(4) else None  # Optional Thickness

            # Cleaning to handle annoying cases when it comes out like this:
            # 'A Another dwelling above\n1st Extension'
            if roof_type.startswith("A Another dwelling above"):
                roof_type = "A Another dwelling above"

            # Store results for this building part
            roof_data.append(
                {
                    "Building Part": part_name,
                    "Roof Type": roof_type,
                    "Roof Insulation": roof_insulation,
                    "Roof Insulation Thickness": roof_insulation_thickness,
                }
            )

        return roof_data

    @staticmethod
    def extract_wall_details(text):
        """
        Extracts wall type, insulation, dry-lining, and thickness for each building part,
        including any alternative wall details within the 7.0 Walls section of the summary PDF text.

        Returns:
            list[dict]: One entry per building part. The "Alternative ..." keys stay
            at their defaults unless an alternative wall follows that part and
            precedes the next one.

        Raises:
            ValueError: If the 7.0 Walls section cannot be located.
        """
        # Locate the entire 7.0 Walls section
        wall_section_match = re.search(r"7\.0 Walls:\n(.*?)\n8\.0 Roofs:", text, re.DOTALL)
        if not wall_section_match:
            raise ValueError("Failed to locate walls section in the text.")
        wall_section = wall_section_match.group(1)

        # Define pattern to match each building part's wall entry within the section
        building_part_pattern = re.compile(
            r"(Main Property|1st Extension|2nd Extension|[\w\s]+)\n"  # Matches each building part label
            r"Type\s+(.*?)\n"  # Matches main wall Type
            r"Insulation\s+(.*?)\n"  # Matches main wall Insulation
            r"(Dry-lining\s+(.*?)\n)?"  # Optional main wall Dry-lining
            r"Wall Thickness Unknown\s+(.*?)\n"  # Matches main wall Thickness Unknown
            r"Wall Thickness \[mm\]\s+(\d+)",  # Matches main wall Thickness
            re.DOTALL
        )

        # Define pattern to capture alternative wall details, if present
        alternative_wall_pattern = re.compile(
            r"Alternative Wall Area.*?\n"  # Matches start of alternative wall section
            r"Alternative Type\s+(.*?)\n"  # Matches alternative wall Type
            r"Alternative Insulation\s+(.*?)\n"  # Matches alternative wall Insulation
            r"(Alternative Dry-lining\s+(.*?)\n)?"  # Optional Alternative Dry-lining
            r"Alternative Wall Thickness Unknown\s+(.*?)\n"  # Matches alternative wall Thickness Unknown
            r"Alternative Wall Thickness\s+(\d+)",  # Matches alternative wall Thickness
            re.DOTALL
        )

        wall_data = []
        part_matches = list(building_part_pattern.finditer(wall_section))

        for index, match in enumerate(part_matches):
            # The generic label alternative ([\w\s]+) can swallow lines that precede
            # the real label, so only the last line of the capture is the label.
            label_lines = match.group(1).strip().splitlines()
            wall_label = label_lines[-1].strip() if label_lines else ""

            wall_entry = {
                "Building Part": wall_label,
                "Wall Type": match.group(2).strip(),
                "Wall Insulation": match.group(3).strip(),
                "Wall Dry-lining": match.group(5).strip() if match.group(5) else "N/A",
                "Wall Thickness Unknown": match.group(6).strip(),
                "Wall Thickness (mm)": int(match.group(7)),
                "Alternative Wall Type": None,
                "Alternative Wall Insulation": None,
                "Alternative Wall Dry-lining": "N/A",
                "Alternative Wall Thickness Unknown": None,
                "Alternative Wall Thickness (mm)": None,
            }

            # Only look for an alternative wall between this entry and the next
            # building part's "Type" value; an unbounded search would attribute a
            # later part's alternative wall to this part.
            if index + 1 < len(part_matches):
                search_end = part_matches[index + 1].start(2)
            else:
                search_end = len(wall_section)

            alt_match = alternative_wall_pattern.search(wall_section, match.end(), search_end)
            if alt_match:
                wall_entry["Alternative Wall Type"] = alt_match.group(1).strip()
                wall_entry["Alternative Wall Insulation"] = alt_match.group(2).strip()
                wall_entry["Alternative Wall Dry-lining"] = alt_match.group(4).strip() if alt_match.group(4) else "N/A"
                wall_entry["Alternative Wall Thickness Unknown"] = alt_match.group(5).strip()
                wall_entry["Alternative Wall Thickness (mm)"] = int(alt_match.group(6))

            # Append each building part as a dictionary in the wall_data list
            wall_data.append(wall_entry)

        return wall_data

    @staticmethod
    def extract_conservatory(text):
        """
        Extracts conservatory data from the provided text.
        The section is located between "5.0 Conservatory" and "7.0 Walls".

        Args:
            text (str): The full text of the Summary Report PDF.

        Returns:
            dict: A dictionary with conservatory details:
                - "Conservatory Present"
                - "Conservatory Separated"
                - "Conservatory Floor Area"
                - "Conservatory Double Glazed"
                - "Conservatory Glazed Perimeter"
                - "Heated Conservatory Height"

        Raises:
            ValueError: If the conservatory section cannot be located.
        """
        # Extract the section between "5.0 Conservatory" and "7.0 Walls"
        conservatory_match = re.search(r"5\.0 Conservatory:(.*?)7\.0 Walls:", text, re.DOTALL)
        if not conservatory_match:
            logger.error("Failed to extract conservatory data.")
            raise ValueError("Could not extract conservatory data.")

        conservatory_text = conservatory_match.group(1)

        # Check if conservatory is present
        present_match = re.search(r"Is there a conservatory\?\s*(Yes|No)", conservatory_text, re.IGNORECASE)

        if not present_match or present_match.group(1).strip().lower() == "no":
            return {
                "Conservatory Present": "No",
                "Conservatory Separated": "",
                "Conservatory Floor Area": 0,
                "Conservatory Double Glazed": "",
                "Conservatory Glazed Perimeter": 0,
                "Heated Conservatory Height": "",
            }

        # NOTE(review): per the original author's note, no report with a conservatory
        # had been seen when this was written, so the patterns below are unverified.
        separated_match = re.search(r"Is it thermally separated\?\s*(Yes|No)", conservatory_text, re.IGNORECASE)
        floor_area_match = re.search(r"Floor Area \[m2\]\s*([\d.]+)", conservatory_text, re.IGNORECASE)
        double_glazed_match = re.search(r"Double Glazed\s*(Yes|No)", conservatory_text, re.IGNORECASE)
        glazed_perimeter_match = re.search(r"Glazed Perimeter \[m\]\s*([\d.]+)", conservatory_text, re.IGNORECASE)
        height_match = re.search(r"Room Height\s*(.*?)(?=\n|$)", conservatory_text, re.IGNORECASE)

        return {
            "Conservatory Present": "Yes",
            "Conservatory Separated": separated_match.group(1).strip() if separated_match else "",
            "Conservatory Floor Area": float(floor_area_match.group(1)) if floor_area_match else 0,
            "Conservatory Double Glazed": double_glazed_match.group(1).strip() if double_glazed_match else "",
            "Conservatory Glazed Perimeter": float(glazed_perimeter_match.group(1)) if glazed_perimeter_match else 0,
            "Heated Conservatory Height": height_match.group(1).strip() if height_match else "",
        }

    def extract(self):
        """
        Extracts specific data from the provided PDF file.

        Data includes assessor name and date, address components, current SAP
        rating, storeys, fuel bill, door and lighting counts, windows, heating
        systems, building parts, roof, wall and conservatory details, and the
        water heating code.

        Raises:
            ValueError: If a required field cannot be found in the PDF text.
        """
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding no text (assumed possible for
            # image-only pages -- TODO confirm against the PyPDF2 version in use).
            text = "".join(page.extract_text() or "" for page in reader.pages)

        def required(pattern, message, flags=0):
            # Required field: raise ValueError rather than failing on `.group` of None
            match = re.search(pattern, text, flags)
            if not match:
                raise ValueError(message)
            return match

        def optional(pattern):
            # Optional field: stripped capture, or "" when the field is absent
            match = re.search(pattern, text)
            return match.group(1).strip() if match else ""

        data = {}

        name_match = required(
            r"Name:\s*([A-Za-z\s]+)\s*Title:\s*([A-Za-z\.]+)", "Couldn't extract surveyor name"
        )
        # Reported as "<title> <name>"
        data["Assessor Name"] = name_match.group(2).strip() + " " + name_match.group(1).strip()
        data["Assessment Date"] = required(
            r"Inspection Date:\s*(.*?)\n", "Could not extract assessment date"
        ).group(1).strip()

        # Address and postcode: each field is delimited by the label of the next one
        postcode = optional(r"Postcode:\s*(.*?)\nRegion:")
        region = optional(r"Region:\s*(.*?)\nHouse Name:")
        house_name = optional(r"House Name:\s*(.*?)\nHouse No:")
        house_no = optional(r"House No:\s*(.*?)\nStreet:")
        street = optional(r"Street:\s*(.*?)\nLocality:")
        locality = optional(r"Locality:\s*(.*?)\nTown:")
        town = optional(r"Town:\s*(.*?)\nCounty:")
        county = optional(r"County:\s*(.*?)\nProperty Tenure:")

        address_parts = [house_no, house_name, street, locality, town, county, region, postcode]

        # Join non-empty parts with a comma
        data["Address"] = ", ".join(part for part in address_parts if part)
        data["Postcode"] = postcode
        data["Region"] = region
        data["House Name"] = house_name
        data["House No"] = house_no
        data["Street"] = street
        data["Locality"] = locality
        data["Town"] = town
        data["County"] = county

        # Extract Current SAP rating (text looks like "<band letter> <score>")
        sap_match = required(r"Current SAP rating:\s*([A-Z] \d+)", "Could not extract SAP rating")
        # NOTE(review): kept as a string for backward compatibility; confirm whether
        # consumers would prefer an int here.
        data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]

        # We don't have primary energy in the summary report
        data["Primary Energy Use Intensity (kWh/m2/yr)"] = None

        # Number of storeys
        storeys_match = required(r"Number of Storeys:\s*(\d+)", "Could not extract number of storeys")
        data["Number of Storeys"] = int(storeys_match.group(1))

        # Extract Fuel Bill
        fuel_bill_match = required(r"Fuel Bill:\s*£(\d+)", "Could not extract fuel bill")
        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"

        # Extract Total Number of Doors
        total_doors_match = required(r"Total Number of Doors\s*(\d+)", "Could not extract total number of doors")
        data["Total Number of Doors"] = int(total_doors_match.group(1))

        # Extract Number of Insulated Doors
        insulated_doors_match = required(
            r"Number of Insulated Doors\s*(\d+)", "Could not extract number of insulated doors"
        )
        data["Number of Insulated Doors"] = int(insulated_doors_match.group(1))

        # lighting
        light_fittings_match = required(
            r"Total number of light fittings\s*(\d+)", "Could not extract number of light fittings"
        )
        lel_fittings_match = required(
            r"Total number of L.E.L. fittings\s*(\d+)", "Could not extract number of L.E.L. fittings"
        )
        data["Number of Light Fittings"] = int(light_fittings_match.group(1))
        data["Number of LEL Fittings"] = int(lel_fittings_match.group(1))
        data["Number of fittings needing LEL"] = data["Number of Light Fittings"] - data["Number of LEL Fittings"]

        windows_section = required(
            r"Windows\s*(.*?)\s*Draught Proofing", "Failed to extract window data.", re.DOTALL
        )
        data["Windows"] = self.extract_window_age_description(windows_section.group(1))

        data["Primary Heating"] = self.extract_primary_heating(text)
        data["Secondary Heating"] = self.extract_secondary_heating_details(text)
        data["Building Parts"] = self.extract_building_parts(text)
        data["Roof Details"] = self.extract_roof_details(text)
        data["Wall Details"] = self.extract_wall_details(text)
        data["Conservatory"] = self.extract_conservatory(text)

        water_heating_code_match = required(
            r"Water Heating Code\s*(.*?)\n", "Failed to extract water heating code."
        )
        data["Water Heating Code"] = water_heating_code_match.group(1).strip()

        return data
+
+
class PulseAirPermeabilityExtractor:
    """
    A utility class for extracting specific data from Pulse Air Permeability Test Reports.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    @staticmethod
    def extract_table(text):
        """
        Parse the results table out of the report text.

        Each metric row is expected to hold two values. In every captured value
        '@' is replaced with '0' and thousands separators (',') are removed; the
        values are returned as strings.

        Returns:
            list[dict]: One dict per metric with keys "Metric",
            "Measured @ 4PA" and "Extrapolated @ 50PA".

        Raises:
            ValueError: If any metric cannot be found in the text.
        """
        patterns = {
            "Air Leakage Rate": r"Air Leakage Rate\s*([\d,@.]+)\s*m/h\s*([\d,@.]+)\s*m3/h",
            "Air Permeability": r"Air Permeability\s*([\d,@.]+)\s*=.*?\s*([\d,@.]+)\s*m\?/m\?h",
            "Air Changes per Hour": r"Air Changes per Hour\s*([\d,@.]+)\s*([\d,@.]+)",
            "Equivalent Leakage Area": r"Equivalent Leakage Area\s*([\d,@.]+)\s*([\d,@.]+)",
            "Calculation Uncertainty": r"Calculation Uncertainty\s*([\d,@.]+)\s*([\d,@.]+)",
        }

        rows = []
        for metric, pattern in patterns.items():
            found = re.search(pattern, text)
            if not found:
                raise ValueError(f"Could not extract metric: {metric}")

            # Clean both column values in one pass
            measured, extrapolated = (
                raw.replace("@", "0").replace(",", "") for raw in found.group(1, 2)
            )
            rows.append(
                {
                    "Metric": metric,
                    "Measured @ 4PA": measured,
                    "Extrapolated @ 50PA": extrapolated,
                }
            )

        return rows

    def extract(self):
        """
        Convert the PDF pages to images, run text recognition on each page and
        parse the results table from the combined text.

        Returns:
            dict: {"Results Table": <output of extract_table>}.
        """
        logger.info("Extracting data from pdf image - this may take a while...")
        page_images = convert_from_path(self.file_path, dpi=300)

        # Concatenate the recognised text of every page
        text = "".join(image_to_string(image) for image in page_images)

        return {"Results Table": self.extract_table(text)}
+
+
class ElmhurstProjectHandoverExtractor:
    """
    A utility class for extracting specific data from The Elmhurst Project Handover document
    """

    def __init__(self, file_path):
        self.file_path = file_path

    @staticmethod
    def _parse_handover_text(text):
        """
        Parse the handover fields out of the document text.

        Args:
            text (str): The full text of the handover document.

        Returns:
            dict: "Retrofit Coordinator Name", "Retrofit Coordinator ID",
            "Designer Name" and "Installer Name" as stripped strings, and
            "Measures Fitted" as a list of the bullet-point entries.

        Raises:
            ValueError: If any of the fields cannot be matched.
        """
        # Regex patterns
        patterns = {
            "Retrofit Coordinator Name": r"Retrofit Coordinator Name:\s*(.+)",
            "Retrofit Coordinator ID": r"Retrofit Coordinator ID:\s*(\d+)",
            "Measures Fitted": r"Measure\(s\) Fitted:\s*([\s\S]*?)\nRetrofit Assessor Name:",
            "Designer Name": r"Designer Name\(s\):\s*(.+)",
            "Installer Name": r"Installer Name\(s\):\s*(.+)",
        }

        data = {}
        for key, pattern in patterns.items():
            match = re.search(pattern, text)
            if not match:
                raise ValueError(f"Could not match {key}")

            if key == "Measures Fitted":
                # Special handling for multiline measures: one entry per bullet point
                measures = re.findall(r"[\u2022\u00b7\u25cf\uf0b7]\s*(.+)", match.group(1))
                data[key] = [measure.strip() for measure in measures]
            else:
                data[key] = match.group(1).strip()

        return data

    def extract(self):
        """
        Read the PDF at ``file_path`` and return the parsed handover fields.

        Returns:
            dict: See ``_parse_handover_text``.

        Raises:
            ValueError: If any of the fields cannot be matched.
        """
        with open(self.file_path, "rb") as file:
            reader = PyPDF2.PdfReader(file)
            # `or ""` guards against a page yielding no text (assumed possible for
            # image-only pages -- TODO confirm against the PyPDF2 version in use).
            text = "".join(page.extract_text() or "" for page in reader.pages)

        return self._parse_handover_text(text)
+
+
class CoreLogicPasAssessmentReportExtractor:
    """
    A utility class for extracting specific data from CoreLogic PAS Assessment Reports.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def extract(self):
        """
        Scan every table on every page for a row mentioning "Number of bedrooms"
        and record the first purely numeric cell of that row.

        Returns:
            dict: {"Number of bedrooms": int} when such a row with a numeric
            cell exists, otherwise an empty dict. If several rows qualify, the
            last one scanned wins.
        """
        data = {}

        with pdfplumber.open(self.file_path) as pdf:
            for page in pdf.pages:
                # Pages without detected tables contribute nothing
                for table in page.extract_tables() or []:
                    for row in table:
                        if not any("Number of bedrooms" in str(cell) for cell in row):
                            continue

                        # First cell of the row holding only digits (empty/None cells skipped)
                        numeric_cell = next(
                            (cell.strip() for cell in row if cell and cell.strip().isdigit()),
                            None,
                        )
                        if numeric_cell is not None:
                            data["Number of bedrooms"] = int(numeric_cell)

        return data
diff --git a/utils/fullSapParser.py b/utils/fullSapParser.py
new file mode 100644
index 00000000..540eff6f
--- /dev/null
+++ b/utils/fullSapParser.py
@@ -0,0 +1,306 @@
+import boto3
+from xml.dom.minidom import parseString
+
# Single-letter age band code -> readable construction period.
PROPERTY_AGE_BAND = dict(
    A="before 1900",
    B="1900-1929",
    C="1930-1949",
    D="1950-1966",
    E="1967-1975",
    F="1976-1982",
    G="1983-1990",
    H="1991-1995",
    I="1996-2002",
    J="2003-2006",
    K="2007-2011",
    L="2012 onwards",
)

# Flat position value -> suffix text. Only the top-floor case is mapped.
POSITION_OF_FLAT = dict(TopFloorFlat="(top floor)")

# Main heating system code -> readable description.
MAINHEATING_LOOKUP = dict(SEB="Electric (SEB modern slimline storage heaters)")

# Window install-date wording -> normalised wording.
# NOTE(review): presumably used when describing glazing; no caller is visible here.
WINDOWS_YEAR_LOOKUP = {
    "unknown install date": "unknown year",
    "unknown install": "unknown year",
    "post or during 2002": "2002 onwards",
}
+
+
class FullSapParser:
    """
    Parse a full SAP XML document (local file or S3 object) into readable
    property attributes.

    The document is loaded in ``__init__``. Call :meth:`extract` to populate
    the attributes below; each ``get_*`` method fills in one group of them
    from ``self.full_sap``.
    """

    full_address = None
    archetype = None
    age_band = None
    unheated_corridor = None
    property_type = None
    built_form = None

    # ventilation
    mechanical_ventilation = None
    cross_ventilation = None
    night_ventilation = None

    # dimensions
    number_of_storeys = None
    property_dimensions = None

    # fabric
    low_energy_lighting = None

    # Heating
    heating1 = None
    cylinder = None
    cylinder_stat = None

    def __init__(self, filekey, bucket_name=None):
        """
        :param filekey: S3 object key when ``bucket_name`` is given, otherwise
            a local file path.
        :param bucket_name: optional S3 bucket name; when falsy the file is
            read from the local filesystem.
        :raises ValueError: if the file cannot be found, read, or parsed.
        """
        self.s3_client = boto3.client('s3')
        self.bucket_name = bucket_name
        self.filekey = filekey
        self.full_sap = None

        self._read_file()

    def _read_file(self):
        """
        Reads the XML file either locally or from S3 and parses it using minidom.

        Raises:
            ValueError: If the file cannot be found, read, or parsed.
        """
        try:
            if self.bucket_name:
                # Read from S3
                response = self.s3_client.get_object(Bucket=self.bucket_name, Key=self.filekey)
                xml_content = response['Body'].read()
            else:
                # Read locally as bytes (same as the S3 path) so the XML
                # parser decodes using the document's own encoding
                # declaration rather than the platform default encoding.
                with open(self.filekey, "rb") as f:
                    xml_content = f.read()

            # Parse the XML content using minidom
            self.full_sap = parseString(xml_content)
        except FileNotFoundError as e:
            raise ValueError(f"Local file not found: {self.filekey}") from e
        except Exception as e:
            raise ValueError(f"An error occurred while reading or parsing the XML: {e}") from e

    def _require_document(self):
        """Raise ValueError unless an XML document has been loaded."""
        if not self.full_sap:
            raise ValueError("You need to read the file first")

    def _first_text(self, tag_name, parent=None):
        """
        Return the text of the first ``tag_name`` element below ``parent``
        (the whole document by default), or None when the tag is missing or
        has no content.
        """
        root = self.full_sap if parent is None else parent
        elements = root.getElementsByTagName(tag_name)
        if not elements or elements[0].firstChild is None:
            return None
        return elements[0].firstChild.nodeValue

    def extract(self, _return=True):
        """
        Run every ``get_*`` step and populate the instance attributes.

        :param _return: when truthy, return a summary dict; otherwise return None.
        :return: dict with "Property Type", "Built Form" and "Age Band" keys,
            or None when ``_return`` is falsy.
        """
        self.get_address()
        self.get_archetype()
        self.get_age_band()
        self.get_unheated_corridor()
        self.get_heating_1()
        self.get_ventilation()
        self.get_floor_area()
        self.get_low_energy_lighting()
        self.get_cylinder()

        if _return:
            return {
                "Property Type": self.property_type,
                "Built Form": self.built_form,
                "Age Band": self.age_band,
            }

    def get_address(self):
        """
        Build ``self.full_address`` from the single AddressAsDesigned element.

        Address lines and town are title-cased; the postcode is appended
        unchanged. Missing or empty parts are skipped.

        :raises ValueError: if no document is loaded or the address tag is
            not unique.
        """
        self._require_document()

        address = self.full_sap.getElementsByTagName("AddressAsDesigned")
        if len(address) != 1:
            raise ValueError("Non-unique address tag found - investigate me")

        data = {
            node.nodeName: node.firstChild.nodeValue if node.firstChild else None
            for node in address[0].childNodes
            if node.nodeType == node.ELEMENT_NODE
        }

        # .get() so an absent element is treated like an empty one
        parts = [
            data.get(key).title()
            for key in ("AddressLine1", "AddressLine2", "AddressLine3", "Town")
            if data.get(key) is not None
        ]
        postcode = data.get("Postcode")
        if postcode is not None:
            parts.append(postcode)

        self.full_address = " ".join(parts)

    def get_archetype(self):
        """
        Set ``property_type``, ``built_form`` and ``archetype``.

        ``archetype`` is "<PropertyType1> - <PropertyType2>", followed by the
        flat position text when a PositionOfFlat value is present.

        :raises ValueError: if no document is loaded or either property type
            tag is not unique.
        :raises KeyError: if PositionOfFlat holds a value that is not in
            POSITION_OF_FLAT.
        """
        self._require_document()

        property_type1 = self.full_sap.getElementsByTagName('PropertyType1')
        property_type2 = self.full_sap.getElementsByTagName('PropertyType2')

        if len(property_type1) != 1 or len(property_type2) != 1:
            raise ValueError("Non-unique property tag found - investigate me")

        property_type1 = property_type1[0].firstChild.nodeValue
        property_type2 = property_type2[0].firstChild.nodeValue

        # The tag may be absent or empty; neither case should raise
        position_value = self._first_text('PositionOfFlat')
        if position_value is not None:
            position_of_flat = POSITION_OF_FLAT[position_value]
        else:
            position_of_flat = None

        self.property_type = property_type1
        self.built_form = property_type2
        self.archetype = property_type1 + " - " + property_type2

        if position_of_flat:
            self.archetype = self.archetype + " " + position_of_flat

    def get_age_band(self):
        """
        Set ``self.age_band`` from the single PropertyAgeBand element.

        :raises ValueError: if no document is loaded or the tag is not unique.
        :raises KeyError: if the band code is not in PROPERTY_AGE_BAND.
        """
        self._require_document()

        property_age_band = self.full_sap.getElementsByTagName('PropertyAgeBand')

        if len(property_age_band) != 1:
            raise ValueError("Non-unique property age band tag found - investigate me")

        property_age_band = property_age_band[0].firstChild.nodeValue
        self.age_band = PROPERTY_AGE_BAND[property_age_band]

    def get_wall_area_for_description(self, description):
        """
        Find the first WallRec whose Description equals ``description`` and
        that has an Area value.

        :param description: exact Description text to look for.
        :return: "Unheated corridor - <area> area", or None when no matching
            wall record with an area exists.
        """
        for wall_rec in self.full_sap.getElementsByTagName("WallRec"):
            # An empty <Description/> yields None and simply does not match
            if self._first_text("Description", wall_rec) != description:
                continue

            area_text = self._first_text("Area", wall_rec)
            if area_text is not None:
                area = float(area_text)
                # Placeholder for wall_description which you'll populate later
                return f"Unheated corridor - {area} area"
        return None

    def get_unheated_corridor(self):
        """
        Unheated corridors don't always exist so we'll need to search for it.

        Sets ``self.unheated_corridor`` to the description string, or None
        when no "Flat corridor Main" wall record is found.

        :raises ValueError: if no document is loaded.
        """
        self._require_document()

        self.unheated_corridor = self.get_wall_area_for_description("Flat corridor Main")

    def get_heating_1(self):
        """
        Set ``self.heating1`` to "<system> : <fraction>% of heating" from the
        single MainHeatingSystem1 element.

        The MHS code is translated through MAINHEATING_LOOKUP when known and
        used as-is otherwise.

        :raises ValueError: if no document is loaded or the tag is not unique.
        """
        self._require_document()

        main_heating_system = self.full_sap.getElementsByTagName('MainHeatingSystem1')

        if len(main_heating_system) != 1:
            raise ValueError("Non-unique main heating system tag found - investigate me")

        main_heating_system = main_heating_system[0]

        mhs = main_heating_system.getElementsByTagName('MHS')[0].firstChild.nodeValue
        mhs = MAINHEATING_LOOKUP.get(mhs, mhs)

        fraction = main_heating_system.getElementsByTagName('Fraction')[0].firstChild.nodeValue

        self.heating1 = f"{mhs} : {fraction}% of heating"

    def get_ventilation(self):
        """
        Set the three ventilation description attributes.

        Each of MechanicalVentilationDecentralised, CrossVentilation and
        NightVentilation is read as text; "true"/"false" are converted to
        booleans and any other text is kept as-is (and so counts as present).
        A missing or empty tag counts as not present.

        :raises ValueError: if no document is loaded.
        """
        self._require_document()

        bool_lookup = {
            "true": True,
            "false": False
        }

        def read_flag(tag_name):
            value = self._first_text(tag_name)
            if value is None:
                return None
            return bool_lookup.get(value, value)

        # All three flags go through the same conversion. Without it the
        # literal string "false" is truthy and would be reported as present.
        mech_vent_value = read_flag("MechanicalVentilationDecentralised")
        cross_vent_value = read_flag("CrossVentilation")
        night_vent_value = read_flag("NightVentilation")

        # Create the outputs
        self.mechanical_ventilation = (
            "Mechanical ventilation present" if mech_vent_value else "No mechanical ventilation"
        )
        self.cross_ventilation = "Cross ventilation present" if cross_vent_value else "No cross ventilation"
        self.night_ventilation = "Night ventilation present" if night_vent_value else "No night ventilation"

    def get_floor_area(self):
        """
        Set ``number_of_storeys`` and ``property_dimensions``.

        ``property_dimensions`` is a list with one dict per usable
        StoreyMeasurementRec, holding "storey_index" (position of the record
        in the document, counting skipped records), "Floor Area", "Perimeter"
        and "Height". Records marked ``xsi:nil="true"``, without an
        InternalFloorArea, or with a floor area of 0 are skipped.

        :raises ValueError: if no document is loaded.
        """
        self._require_document()

        self.number_of_storeys = int(
            self.full_sap.getElementsByTagName('NumberOfStoreys')[0].firstChild.nodeValue
        )
        storeys = self.full_sap.getElementsByTagName('StoreyMeasurementRec')

        # TODO: The first StoreyMeasurementRec tag in the examples we've seen
        #  carries xsi:nil="true", e.g. <StoreyMeasurementRec xsi:nil="true"/>,
        #  indicating that the tag is explicitly marked as empty.

        storey_data = []
        storey_index = -1
        for storey in storeys:
            storey_index += 1

            if storey.getAttribute("xsi:nil") == "true":
                continue

            # NOTE(review): storey_index is incremented before this check, so
            # it is never -1 here and this branch cannot fire. Left in place
            # because the intended basement handling is unclear - confirm.
            if storey_index == -1:
                raise NotImplementedError(
                    "Investigated me - potentially basement found but need to confirm with Basement tag"
                )

            floor_area = storey.getElementsByTagName('InternalFloorArea')
            if not floor_area:
                continue

            floor_area = float(floor_area[0].firstChild.nodeValue)
            # If floor area is 0, skip this storey
            if not floor_area:
                continue

            perimeter = float(storey.getElementsByTagName('InternalPerimeter')[0].firstChild.nodeValue)
            height = float(storey.getElementsByTagName('StoreyHeight')[0].firstChild.nodeValue)

            storey_data.append({
                "storey_index": storey_index,
                "Floor Area": floor_area,
                "Perimeter": perimeter,
                "Height": height
            })

        # We will convert this into a table in the markdown
        self.property_dimensions = storey_data

    def get_low_energy_lighting(self):
        """
        Set ``self.low_energy_lighting`` to a sentence built from the
        LELFittings and LightFittings values.

        :raises ValueError: if no document is loaded.
        """
        self._require_document()

        # Extract the values of the LightFittings and LELFittings tags
        light_fittings = self.full_sap.getElementsByTagName('LightFittings')[0].firstChild.data
        lel_fittings = self.full_sap.getElementsByTagName('LELFittings')[0].firstChild.data

        # Construct the string message
        self.low_energy_lighting = f"{lel_fittings} out of {light_fittings} lighting fittings are low energy."

    def get_cylinder(self):
        """
        Set ``self.cylinder`` and ``self.cylinder_stat``.

        ``cylinder`` describes the insulation when both InsulationType and
        InsulationThickness have a value, and is "Not insulated." otherwise,
        including when either tag is missing or empty. ``cylinder_stat`` is
        the CylinderStat text, or None when that tag is missing or empty.

        :raises ValueError: if no document is loaded.
        """
        self._require_document()

        insulation_type = self._first_text('InsulationType')
        insulation_thickness = self._first_text('InsulationThickness')

        if insulation_type and insulation_thickness:
            self.cylinder = f"Insulated, {insulation_type}: {insulation_thickness}mm."
        else:
            self.cylinder = "Not insulated."

        self.cylinder_stat = self._first_text('CylinderStat')