diff --git a/.idea/terraform.xml b/.idea/terraform.xml
new file mode 100644
index 00000000..cd46a3d3
--- /dev/null
+++ b/.idea/terraform.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/asset_list/AssetList.py b/asset_list/AssetList.py
new file mode 100644
index 00000000..306edd99
--- /dev/null
+++ b/asset_list/AssetList.py
@@ -0,0 +1,1518 @@
+import hashlib
+import os
+import re
+import tiktoken
+from pprint import pprint
+from datetime import datetime
+from openai import OpenAI
+import numpy as np
+import pandas as pd
+from fuzzywuzzy import process
+from utils.logger import setup_logger
+from backend.SearchEpc import SearchEpc
+from BaseUtility import Definitions
+import asset_list.mappings.property_type as property_type_mappings
+import asset_list.mappings.walls as walls_mappings
+import asset_list.mappings.heating_systems as heating_mappings
+import asset_list.mappings.exising_pv as existing_pv_mappings
+
+from recommendations.recommendation_utils import (
+ estimate_perimeter,
+ estimate_external_wall_area,
+ estimate_number_of_floors
+)
+
+from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
+
+# Module-wide logger, created through the project's logging helper
+logger = setup_logger()
+
+# The OpenAI API key is read from the environment rather than hard-coded, so that it never
+# lives in source control. It is None when the variable is not set.
+OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+
+class DataRemapper:
+    """
+    Maps raw, free-text values onto a fixed set of standardised values.
+
+    Values are resolved, in order, by: the predefined mapping, an exact match against the standard
+    values, fuzzy matching, and finally one batched OpenAI request for whatever is still unmapped.
+    """
+
+    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
+        """
+        Initialize the remapper with standard values and a predefined mapping.
+
+        :param standard_values: Set of allowed standardized values.
+        :param standard_map: Dictionary of common remappings {raw_value: standard_value}. When omitted,
+            no predefined mapping is applied.
+        :param max_tokens: Limit applied to the prompt size and passed as the completion limit.
+        """
+        self.standard_values = standard_values
+        # A missing mapping is treated as empty so that membership checks never run against None
+        self.standard_map = standard_map if standard_map is not None else {}
+        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
+        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing
+
+        # Tokenizer for counting tokens
+        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)
+
+        # Track token usage and remap dictionary
+        self.total_tokens_used = 0
+        self.total_cost = 0
+        self.remap_dict = {}  # {original_value: standardized_value}
+        self.max_tokens = max_tokens  # Limit for OpenAI API
+
+        # Memoization for AI calls
+        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
+        # Capture the response for debugging
+        self.ai_response = None
+
+        # OpenAI pricing (as of Feb 2024)
+        self.pricing = {
+            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
+            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
+        }
+
+        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)
+
+    @staticmethod
+    def clean_string(text):
+        """
+        Basic text cleaning: remove extra spaces, punctuation, and normalize case.
+
+        :param text: Value to clean.
+        :return: The cleaned string, or None when text is not a string.
+        """
+        if not isinstance(text, str):
+            return None
+        text = text.strip().lower()
+        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+        # Collapse runs of whitespace into a single space
+        text = re.sub(r'\s+', ' ', text)
+        return text
+
+    def fuzzy_match(self, text):
+        """
+        Use fuzzy matching to find the closest standard value.
+
+        :param text: Cleaned value to match.
+        :return: The best matching standard value when its score reaches the threshold, otherwise None.
+        """
+        if not text:
+            return None
+
+        result = process.extractOne(text, self.standard_values)
+        # No candidate at all (e.g. no standard values to choose from)
+        if result is None:
+            return None
+
+        # Index rather than unpack, so a result carrying extra items does not break the call
+        match, score = result[0], result[1]
+        return match if score >= self.fuzzy_threshold else None
+
+    def count_tokens(self, text):
+        """Estimate the number of tokens in a given text."""
+        return len(self.tokenizer.encode(text)) if text else 0
+
+    def ai_standardize(self, unmapped_values):
+        """
+        Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.
+
+        :param unmapped_values: List of raw values that no other method could map.
+        :return: Dictionary {original_value: standardized_value}. Values that the model did not return,
+            or all values when the response is not valid JSON, are mapped to "unknown".
+        :raises ValueError: If the prompt is larger than max_tokens.
+        """
+        # Local import: json is only needed for parsing the model response
+        import json
+
+        if not unmapped_values:
+            return {}
+
+        unmapped_tuple = tuple(sorted(unmapped_values))  # Ensure consistency for memoization
+        if unmapped_tuple in self.ai_cache:
+            return self.ai_cache[unmapped_tuple]  # Return memoized result
+
+        prompt = f"""
+        You are an expert in data classification. Standardize each of these values into one of the categories:
+        {list(self.standard_values)}.
+
+        Return only a JSON dictionary where:
+        - The keys are the original values.
+        - The values are the standardized ones.
+
+        Strictly return JSON **without markdown formatting** or extra text.
+
+        Example Output:
+        {{
+        "BLKHOUS": "block house",
+        "BEDSIT": "bedsit"
+        }}
+
+        Values to standardize:
+        {unmapped_values}
+        """
+
+        # Count input tokens
+        input_tokens = self.count_tokens(prompt)
+        if input_tokens > self.max_tokens:
+            raise ValueError("Input tokens exceed the maximum limit.")
+
+        logger.info("Calling OpenAI API for standardization...")
+        response = self.openai_client.chat.completions.create(
+            model=self.ai_model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=self.max_tokens,
+            temperature=0.1,
+        )
+
+        output_text = response.choices[0].message.content.strip()
+        output_tokens = self.count_tokens(output_text)  # Count output tokens
+
+        # Track total token usage
+        self.total_tokens_used += input_tokens + output_tokens
+
+        # Estimate cost
+        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
+        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
+        self.total_cost += input_cost + output_cost
+
+        # The response is untrusted text: parse it as JSON, never evaluate it as code
+        try:
+            parsed = json.loads(output_text)
+        except ValueError:
+            logger.warning("AI response was not valid JSON - mapping all values to 'unknown'")
+            parsed = {}
+        if not isinstance(parsed, dict):
+            parsed = {}
+
+        # Every requested value gets an entry. JSON keys are always strings, so non-string
+        # originals are also looked up by their string form before falling back to "unknown".
+        mapping = {
+            val: parsed.get(val, parsed.get(str(val), "unknown")) for val in unmapped_values
+        }
+
+        # Memoize the AI response
+        self.ai_cache[unmapped_tuple] = mapping
+        # We store the raw AI response for debugging
+        logger.debug("AI Response: %s", mapping)
+        self.ai_response = output_text
+
+        return mapping
+
+    def standardize_list(self, values_to_remap):
+        """
+        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.
+
+        :param values_to_remap: List of raw values to standardize.
+        :return: Dictionary {original_value: standardized_value}. This is the instance's cumulative
+            remap dictionary, so it also holds results from earlier calls.
+        """
+        standard_map = self.standard_map if self.standard_map is not None else {}
+        unique_values = set(values_to_remap)  # Process only unique values
+
+        unmapped_values = []
+        for value in unique_values:
+            if pd.isna(value):  # Handle NaN values
+                self.remap_dict[value] = "unknown"
+                continue
+
+            cleaned_value = self.clean_string(value)
+
+            # Rule-Based Check (Predefined Mapping): cleaned form first, then the raw value
+            if cleaned_value in standard_map:
+                self.remap_dict[value] = standard_map[cleaned_value]
+                continue
+            if value in standard_map:
+                self.remap_dict[value] = standard_map[value]
+                continue
+
+            # Lower-cased raw value; only strings can be lower-cased
+            if isinstance(value, str) and value.lower() in standard_map:
+                self.remap_dict[value] = standard_map[value.lower()]
+                continue
+
+            # Exact Match in Standard Values
+            if cleaned_value in self.standard_values:
+                self.remap_dict[value] = cleaned_value
+                continue
+
+            # Fuzzy Matching
+            fuzzy_match = self.fuzzy_match(cleaned_value)
+            if fuzzy_match:
+                self.remap_dict[value] = fuzzy_match
+                continue
+
+            # Capture anything that wasn't mapped
+            unmapped_values.append(value)
+
+        # AI Model - remap anything unmapped (batch request)
+        ai_mapping = self.ai_standardize(unmapped_values)
+        self.remap_dict.update(ai_mapping)
+
+        return self.remap_dict
+
+    def report_usage(self):
+        """Prints a summary of token usage and cost."""
+        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
+        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
+
+
+class AssetList:
+ """
+ This class is used to standardise asset lists so that we can process the core information in a consistent manner.
+ """
+
+ # Maps field names as returned by the EPC API to the column names used in the standardised asset list
+ EPC_API_DATA_NAMES = {
+ "uprn": "epc_os_uprn",
+ "address1": "epc_address1",
+ "address": "epc_address",
+ "postcode": "epc_postcode",
+ "inspection-date": "epc_inspection_date",
+ "current-energy-efficiency": "epc_sap_score_on_register",
+ "current-energy-rating": "epc_rating_on_register",
+ "property-type": "epc_property_type",
+ "built-form": "epc_archetype",
+ "total-floor-area": "epc_total_floor_area",
+ "construction-age-band": "epc_age_band",
+ "floor-height": "epc_floor_height",
+ "number-habitable-rooms": "epc_number_habitable_rooms",
+ "walls-description": "epc_wall_construction",
+ "roof-description": "epc_roof_construction",
+ "floor-description": "epc_floor_construction",
+ "mainheat-description": "epc_heating_type",
+ 'mainheatcont-description': "epc_heating_controls",
+ "secondheat-description": "epc_secondary_heating",
+ "transaction-type": "epc_reason",
+ "energy-consumption-current": "epc_heat_demand",
+ "photo-supply": "epc_photo_supply",
+ "estimated": "estimated"
+ }
+ # Maps a second set of EPC field names to standardised column names
+ # NOTE(review): the keys mix curly (’) and straight (') apostrophes - presumably they mirror the
+ # upstream source exactly; confirm before normalising.
+ # NOTE(review): "epc_estiamted_heating_kwh" looks like a misspelling of "estimated", but it is a runtime
+ # column name, so renaming it needs a coordinated change wherever the column is consumed.
+ FIND_EPC_DATA_NAMES = {
+ "heating_text": "epc_estiamted_heating_kwh",
+ "hot_water_text": "epc_estimated_hotwater_kwh",
+ 'Assessor’s name': "epc_assessor_name",
+ "Assessor's Telephone": "epc_assessor_telephone",
+ "Assessor's Email": "epc_assessor_email",
+ "Accreditation scheme": "epc_assessor_accreditation",
+ "Assessor’s ID": "epc_assessor_id",
+ "Solar photovoltaics": "epc_solar_pv"
+ }
+
+ # Free-text year-built values that are replaced with a concrete datetime before conversion
+ DATETIME_REMAP = {
+ "Pre 1900": datetime(year=1899, month=12, day=31),
+ }
+
+ # These are the accepted methods we have for cleaning the address1 column
+ ADDRESS_1_CLEANING_METHODS = [
+ "first_two_words", # This method will split on the first two words, where the separator is a space
+ "first_word", # This method will split on the first word, where the separator is a space
+ "house_number_extraction", # This method will use the NLP model in SearchEPC to extract the housenumber
+ # "address1_extraction" # This method will use the NLP model to extract address1
+ ]
+
+ # Standard column names
+ STANDARD_ADDRESS_1 = "domna_address_1"
+ STANDARD_POSTCODE = "domna_postcode"
+ STANDARD_FULL_ADDRESS = "domna_full_address"
+ STANDARD_YEAR_BUILT = "landlord_year_built"
+ STANDARD_UPRN = "ordnance_survey_uprn"
+ STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id"
+ STANDARD_PROPERTY_TYPE = "landlord_property_type"
+ STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
+ STANDARD_HEATING_SYSTEM = "landlord_heating_system"
+ STANDARD_EXISTING_PV = "landlord_existing_pv"
+
+ # Column holding the property ID that this class generates
+ DOMNA_PROPERTY_ID = "domna_property_id"
+
+ # Regular expression for identifying if the address might point to multiple units
+ # (two alphanumeric tokens joined by a hyphen, e.g. "1-4")
+ MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')
+
+ # List of columns relating to the non-intrusive data
+ NON_INTRUSIVES_COLNAMES = [
+ "Archetype", "Construction", "Insulated", "Material", "CIGA Check Required",
+ "PV, ACCESS ISSUE, SEE NOTES", "OFF GAS - ROOF ORIENTATION",
+ "Any further surveyor notes", 'Surveyors Name'
+ ]
+
+ # This SAP threshold is a key search criteria for properties that may be eligible for extraction
+ FILLED_CAVITY_SAP_THRESHOLD = 75
+ # SAP threshold for empty cavities
+ # NOTE(review): currently identical to the filled cavity threshold - confirm that this is intended
+ EMPTY_CAVITY_SAP_THRESHOLD = 75
+ # Any EPC deemed to have been conducted prior to this year is deemed to be unreliable.
+ # This is evaluated once, when the class body is executed.
+ EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5
+
+ # Attributes - these are columns that we produce, calculated based on other pieces of data
+ ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
+ ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
+ ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
+ ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
+ ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
+ ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = f"sap_rating_{FILLED_CAVITY_SAP_THRESHOLD}_and_below"
+ ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = f"epc_is_pre_{EPC_YEAR_THRESHOLD}"
+
+ # These are the descriptions that we look for in the EPC data that are indicative of no insulation
+ # (the list also contains the "partial insulation" descriptions)
+ EPC_NO_WALL_INSULATION_DESCRIPTIONS = [
+ "cavity wall, as built, no insulation (assumed)",
+ "cavity wall, as built, partial insulation (assumed)",
+ "cavity wall, as built, partial insulation",
+ "cavity wall, as built, no insulation",
+ ]
+
+ # List of strings that we look for in the EPC data, where substrings indicate that the wall is insulated
+ EPC_INSULATED_WALLS_SUBSTRINGS = [
+ ", insulated", "with external insulation", "with internal insulation", "filled cavity"
+ ]
+
+ # List of strings that we look for in the EPC data, where substrings indicate that the roof is insulated
+ # NOTE(review): ", insulated (assumed) " carries a trailing space - confirm that this is deliberate
+ EPC_INSULATED_ROOF_SUBSTRINGS = [
+ "(another dwelling above)", ", insulated", ", insulated (assumed) ",
+ ", ceiling insulated",
+ ]
+
+ def __init__(
+     self,
+     local_filepath,
+     sheet_name,
+     address1_colname,
+     postcode_colname,
+     full_address_colname,
+     landlord_property_id=None,
+     full_address_cols_to_concat=None,
+     missing_postcodes_method=None,
+     address1_extraction_method=None,
+     landlord_year_built=None,
+     landlord_uprn=None,
+     landlord_property_type=None,
+     landlord_wall_construction=None,
+     landlord_heating_system=None,
+     landlord_existing_pv=None,
+     header=0
+ ):
+     """
+     Load the landlord's asset list from an Excel sheet and record how its columns should be read.
+
+     :param local_filepath: Path of the Excel workbook to read.
+     :param sheet_name: Sheet within the workbook holding the asset list.
+     :param address1_colname: Column holding the first address line, or None if it must be derived.
+     :param postcode_colname: Column holding the postcode.
+     :param full_address_colname: Column holding the full address, or None if it must be built.
+     :param landlord_property_id: Column holding the landlord's own property identifier.
+     :param full_address_cols_to_concat: Columns joined to build the full address when it is missing.
+     :param missing_postcodes_method: Stored on the instance for later cleaning steps.
+     :param address1_extraction_method: Method used to derive address 1 when it is missing.
+     :param landlord_year_built: Column holding the build year.
+     :param landlord_uprn: Column holding the UPRN.
+     :param landlord_property_type: Column holding the property type.
+     :param landlord_wall_construction: Column holding the wall construction.
+     :param landlord_heating_system: Column holding the heating system.
+     :param landlord_existing_pv: Column holding existing PV information.
+     :param header: Row used as the header, passed to pandas.read_excel.
+     """
+     # Where the data came from
+     self.local_filepath = local_filepath
+     self.sheet_name = sheet_name
+
+     # Read in the data; all cleaning happens on a copy so that the raw data is retained
+     self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
+     self.standardised_asset_list = self.raw_asset_list.copy()
+
+     # Aggregated figures against the various work types
+     self.work_type_figures = {}
+     self.work_type_breakdowns = {}
+     self.flat_data = None
+     self.duplicated_addresses = None
+
+     # The non-intrusive columns are detected through one of their column names
+     self.non_intrusives_present = "CIGA Check Required" in self.raw_asset_list.columns
+
+     # Names of the address columns
+     self.address1_colname = address1_colname
+     self.postcode_colname = postcode_colname
+     self.full_address_colname = full_address_colname
+
+     # Names of the landlord supplied columns
+     self.landlord_property_id = landlord_property_id
+     self.landlord_uprn = landlord_uprn
+     self.landlord_year_built = landlord_year_built
+     self.landlord_property_type = landlord_property_type
+     self.landlord_wall_construction = landlord_wall_construction
+     self.landlord_heating_system = landlord_heating_system
+     self.landlord_existing_pv = landlord_existing_pv
+
+     # Parameters for cleaning
+     self.full_address_cols_to_concat = full_address_cols_to_concat
+     self.missing_postcodes_method = missing_postcodes_method
+     self.address1_extraction_method = address1_extraction_method
+
+     self.debug_information = dict.fromkeys(
+         ("property_type", "wall_construction", "heating_system", "existing_pv")
+     )
+
+     # Populated by init_standardise
+     self.variable_mappings = {}
+     self.rename_map = {}
+     self.keep_variables = []
+
+     # Finally, we handle the case where the landlord's property ID is actually the OS UPRN: the
+     # values are copied into the standard UPRN column, which then becomes the UPRN reference
+     if self.landlord_property_id is not None and self.landlord_uprn == self.landlord_property_id:
+         self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
+         self.landlord_uprn = self.STANDARD_UPRN
+
+ def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
+     """
+     Derive the address 1 column from the full address and write it to self.address1_colname.
+
+     :param asset_list: Dataframe to add the column to (modified in place and returned).
+     :param full_address_col: Name of the column holding the full address.
+     :param postcode_col: Name of the column holding the postcode.
+     :param method: One of ADDRESS_1_CLEANING_METHODS.
+     :return: The asset list with the address 1 column populated.
+     :raises ValueError: If the method is not recognised.
+     """
+     if method not in self.ADDRESS_1_CLEANING_METHODS:
+         raise ValueError(f"Method {method} for producing address1 not recognized")
+
+     if method == "house_number_extraction":
+         # Row-wise extraction of the house number from address and postcode
+         asset_list[self.address1_colname] = asset_list.apply(
+             lambda row: SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col]),
+             axis=1
+         )
+     elif method == "first_word":
+         words = asset_list[full_address_col].str.split(" ")
+         asset_list[self.address1_colname] = words.str[0]
+     elif method == "first_two_words":
+         words = asset_list[full_address_col].str.split(" ")
+         asset_list[self.address1_colname] = words.str[:2].str.join(" ")
+     else:
+         raise ValueError(f"Method {method} not recognized")
+
+     return asset_list
+
+ @staticmethod
+ def _address1_extraction(x):
+     """Placeholder that is not implemented yet: accepts a value and returns None."""
+     return None
+
+ def create_property_id(self):
+     """
+     Create the domna property ID column.
+
+     The full address and postcode are concatenated, stripped of punctuation and spaces and
+     lower-cased; the ID is that text followed by "-" and the first 12 hex characters of its
+     SHA-256 digest.
+     :return: None. The column is written to the standardised asset list.
+     """
+
+     def _suffix_with_digest(text):
+         """Return the normalised text followed by a truncated SHA-256 digest of the text."""
+         slug = re.sub(r"[^\w\s-]", "", text).replace(" ", "_").lower()
+         digest = hashlib.sha256(text.encode()).hexdigest()
+         return f"{slug}-{digest[:12]}"
+
+     frame = self.standardised_asset_list
+     combined = frame[self.full_address_colname] + frame[self.postcode_colname]
+
+     # Remove punctuation and spaces before hashing, so that formatting differences do not matter
+     normalised = combined.str.strip()
+     normalised = normalised.str.replace(r"[^\w\s]", "", regex=True)
+     normalised = normalised.str.replace(" ", "").str.lower()
+
+     self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = normalised.apply(_suffix_with_digest)
+
+ @staticmethod
+ def _strip_postcode_from_full_address(full_address, postcode):
+     """
+     Remove the postcode from the full address and tidy up what is left.
+
+     :param full_address: Address that may contain the postcode.
+     :param postcode: Postcode to remove (exact, case-sensitive match).
+     :return: The address without the postcode and without surrounding commas or whitespace.
+     """
+     without_postcode = full_address.replace(postcode, "")
+     # Trailing commas and spaces first, then any commas and whitespace left at either end
+     trimmed_end = without_postcode.rstrip(", ")
+     return trimmed_end.strip(",").strip()
+
+ @classmethod
+ def _identify_multi_address(cls, address):
+     """
+     Flag addresses whose first comma-separated section contains a range in the form x-y.
+
+     :param address: Full address to inspect.
+     :return: True when the section before the first comma matches MULTI_UNIT_REGEX. False otherwise,
+         including for addresses without a comma and for values that are not strings.
+     """
+     # Always return a bool, so that the resulting column never mixes None with True/False
+     if not isinstance(address, str) or "," not in address:
+         return False
+
+     address1_section = address.split(",")[0]
+     # We look for a string in the form (x-y)
+     return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
+
+ @staticmethod
+ def _convert_uprn(x):
+     """
+     Used to convert UPRNs to integer strings
+     :param x: uprn to convert
+     :return: the UPRN as a string of digits when x is a number or a string of digits (surrounding
+         whitespace is ignored); otherwise x unchanged, including when x is null
+     """
+
+     if pd.isnull(x):
+         return x
+
+     # Numbers (e.g. 100012345678.0 as read from Excel) lose their decimal part. An explicit type
+     # check is used because np.isreal is not a reliable numeric test for strings.
+     if isinstance(x, (int, float, np.integer, np.floating)):
+         return str(int(x))
+
+     # Strings of digits are normalised through int, which also drops leading zeros
+     text = str(x).strip()
+     if text.isdigit():
+         return str(int(text))
+     return x
+
+ def init_standardise(self):
+     """
+     Prepare the asset list for standardisation and build the category mappings.
+
+     The address columns are cleaned, the property ID is created, the year built is converted to a
+     year, and a remapping dictionary is produced for each categorical column that the landlord
+     provided. The mappings are printed so that they can be reviewed before apply_standardiation
+     is run.
+
+     :return: None. Results are stored on the instance (standardised_asset_list, keep_variables,
+         rename_map and variable_mappings).
+     :raises ValueError: If address 1 is missing without an extraction method, or the full address
+         is missing without columns to concatenate.
+     :raises NotImplementedError: If a year built value is in a format that is not handled.
+     """
+
+     # Remove rows without a postcode
+     if self.postcode_colname is not None:
+         self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
+
+     # We clean up potential non-breaking spaces, and double spaces
+     for col in [
+         c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
+         c is not None
+     ]:
+         self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
+         self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
+         # Two spaces become one; replacing a single space with itself would be a no-op
+         self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('  ', ' ', regex=False)
+
+     if self.address1_colname is None:
+         if self.address1_extraction_method is None:
+             raise ValueError("Missing address 1 - please specify an extraction method")
+         self.address1_colname = self.STANDARD_ADDRESS_1
+         # If we do not have this, we produce it
+         self.standardised_asset_list = self._extract_address1(
+             asset_list=self.standardised_asset_list,
+             full_address_col=self.full_address_colname,
+             postcode_col=self.postcode_colname,
+             method=self.address1_extraction_method
+         )
+
+     if self.full_address_colname is None:
+         if not self.full_address_cols_to_concat:
+             raise ValueError("Missing full address - please specify columns to concatenate")
+         self.full_address_colname = self.STANDARD_FULL_ADDRESS
+         self.standardised_asset_list[self.full_address_colname] = (
+             self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1)
+         )
+     else:
+         # Make sure to strip the postcode out of the full address
+         self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
+             lambda x: self._strip_postcode_from_full_address(
+                 full_address=x[self.full_address_colname],
+                 postcode=x[self.postcode_colname]
+             ),
+             axis=1
+         )
+
+     # We create the domna property id
+     self.create_property_id()
+
+     # Clean up the UPRN column, if the landlord has provided them
+     if self.landlord_uprn is not None:
+         self.standardised_asset_list[self.landlord_uprn] = (
+             self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn)
+         )
+
+     # We keep just the columns we care about and will work through the various columns and standardise
+     variables = [
+         self.landlord_property_id,
+         self.DOMNA_PROPERTY_ID,
+         self.address1_colname,
+         self.postcode_colname,
+         self.full_address_colname,
+         self.landlord_uprn,
+         self.landlord_property_type,
+         self.landlord_year_built,
+         self.landlord_wall_construction,
+         self.landlord_heating_system,
+         self.landlord_existing_pv
+     ]
+     # Keep just non-null variables (e.g. the landlord may not provide a uprn)
+     self.keep_variables = [v for v in variables if v is not None]
+     self.rename_map = {
+         self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
+         self.address1_colname: self.STANDARD_ADDRESS_1,
+         self.postcode_colname: self.STANDARD_POSTCODE,
+         self.full_address_colname: self.STANDARD_FULL_ADDRESS,
+         self.landlord_uprn: self.STANDARD_UPRN,
+         self.landlord_property_type: self.STANDARD_PROPERTY_TYPE,
+         self.landlord_year_built: self.STANDARD_YEAR_BUILT,
+         self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION,
+         self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
+         self.landlord_existing_pv: self.STANDARD_EXISTING_PV
+     }
+     self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None}
+
+     if self.non_intrusives_present:
+         self.keep_variables += self.NON_INTRUSIVES_COLNAMES
+         self.rename_map = {
+             **self.rename_map,
+             **dict(
+                 zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES])
+             )
+         }
+
+     # We identify addresses which are likely to be multi-addresses (e.g. are rooms x-y)
+     self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
+         self.full_address_colname
+     ].apply(lambda x: self._identify_multi_address(x))
+
+     # We handle cleaning for walls, in the instance that the landlord provides us with EPC data and
+     # we see instances of "average thermal transmittance" in the description.
+     # The wall construction column is optional, so this only runs when it was provided.
+     if self.landlord_wall_construction is not None:
+         wall_col = self.landlord_wall_construction
+         # na=False: missing descriptions are treated as not matching
+         is_thermal_transmittance = self.standardised_asset_list[wall_col].str.lower().str.contains(
+             "average thermal transmittance", na=False
+         )
+         self.standardised_asset_list[wall_col] = np.where(
+             is_thermal_transmittance,
+             "new build - average thermal transmittance",
+             self.standardised_asset_list[wall_col]
+         )
+
+     # Clean our build year column
+     # We attempt to process the year built column
+     if self.landlord_year_built is not None:
+         year_built = self.standardised_asset_list[self.landlord_year_built]
+         # We check if we have a datetime - year built has not been renamed.
+         # The first value decides the branch; an empty frame has no first value to inspect.
+         if len(year_built) > 0 and isinstance(year_built.iloc[0], datetime):
+             # We treat any string values - with common values we see
+             self.standardised_asset_list[self.landlord_year_built] = (
+                 self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP)
+             )
+
+             self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime(
+                 self.standardised_asset_list[self.landlord_year_built]
+             )
+             # Convert this to year
+             self.standardised_asset_list[self.landlord_year_built] = (
+                 self.standardised_asset_list[self.landlord_year_built].dt.year
+             )
+         else:
+             # We attempt to convert the year built to a year, by detecting the format and converting
+
+             def extract_year(date_str):
+                 """
+                 Extracts the year from a value that is a date string in the format '01-Jul-YYYY',
+                 a datetime, or a four digit year.
+                 Returns the year as an integer, or None for missing values and known errors.
+                 Raises NotImplementedError for any other format.
+                 """
+                 known_errors = ["#MULTIVALUE"]
+
+                 if pd.isnull(date_str) or date_str in known_errors:
+                     return None
+
+                 if isinstance(date_str, str):
+                     match = re.match(r"\d{1,2}-[A-Za-z]{3}-(\d{4})", date_str)
+                     if match:
+                         return int(match.group(1))  # Extract the year and convert to integer
+
+                 if isinstance(date_str, datetime):
+                     return date_str.year
+
+                 # Check if date_str is a year itself
+                 if str(date_str).isdigit() and len(str(date_str)) == 4:
+                     return int(date_str)
+
+                 # A year column containing missing values is read as floats (e.g. 1990.0)
+                 if (
+                     isinstance(date_str, (float, np.floating))
+                     and float(date_str).is_integer()
+                     and 1000 <= date_str <= 9999
+                 ):
+                     return int(date_str)
+
+                 raise NotImplementedError("Unhandled format for year built - implement me")
+
+             self.standardised_asset_list[self.landlord_year_built] = self.standardised_asset_list[
+                 self.landlord_year_built
+             ].apply(extract_year)
+
+     # We now create standard lookups
+     to_remap = {
+         self.landlord_property_type: {
+             "standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES,
+             "standard_map": property_type_mappings.PROPERTY_MAPPING
+         },
+         self.landlord_wall_construction: {
+             "standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS,
+             "standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS
+         },
+         self.landlord_heating_system: {
+             "standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS,
+             "standard_map": heating_mappings.HEATING_MAPPINGS
+         },
+         self.landlord_existing_pv: {
+             "standard_values": existing_pv_mappings.STANDARD_EXISTING_PV,
+             "standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
+         }
+     }
+     # Keep just entries where the key is not None
+     to_remap = {k: v for k, v in to_remap.items() if k is not None}
+
+     for variable, config in to_remap.items():
+         logger.info("Standardising variable: %s", variable)
+         values_to_remap = self.standardised_asset_list[variable].unique()
+         # We want to map this to our standardised list of values we're interested in
+         remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"])
+         remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist())
+         self.variable_mappings[variable] = remap_dictionary
+
+     # We now print out the variable mappings, which can be reviewed by the user, before the final standardised
+     # asset list is returned
+     for variable, mapping in self.variable_mappings.items():
+         pprint(f"Variable: {variable}")
+         pprint(mapping)
+         # Print a space
+         print("\n")
+         pprint("=======================================")
+
+ def apply_standardiation(self, override_empty_mappings=False):
+     """
+     Apply the mappings produced by init_standardise and reduce the asset list to the standard columns.
+
+     :param override_empty_mappings: If true, will override the check for empty mappings. This is only
+         relevant if there are no categories which need remapping, which is highly unlikely
+     :return: None. The standardised asset list is updated on the instance.
+     :raises ValueError: If there are no mappings and the check has not been overridden.
+     """
+     if not (self.variable_mappings or override_empty_mappings):
+         raise ValueError("Please run init_standardise first")
+
+     logger.info("Applying standardisation to asset list")
+
+     # Keep what the landlord supplied next to the standardised value
+     for variable, mapping in self.variable_mappings.items():
+         original_values = self.standardised_asset_list[variable].copy()
+         self.standardised_asset_list[variable + "_original_from_landlord"] = original_values
+         self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
+
+     duplicate_count = self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()
+     if duplicate_count:
+         # Drop the dupes
+         pprint(f"There are {duplicate_count} duplicated addresses - dropping")
+
+         is_duplicate = self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
+
+         # Keep a record of duplicates
+         duplicate_columns = [self.DOMNA_PROPERTY_ID, self.address1_colname, self.postcode_colname]
+         self.duplicated_addresses = self.standardised_asset_list[is_duplicate][duplicate_columns].copy()
+
+         self.standardised_asset_list = self.standardised_asset_list[~is_duplicate]
+
+     # Perform final variable selection and renaming to our standard names.
+     # The original landlord columns are kept as well.
+     self.keep_variables.extend(
+         k + "_original_from_landlord" for k in self.variable_mappings
+     )
+     selected = self.standardised_asset_list[self.keep_variables]
+     self.standardised_asset_list = selected.rename(columns=self.rename_map)
+
+     # We fill any standard columns that are not in the data because they were not provided by the landlord
+     standard_columns = (
+         self.STANDARD_EXISTING_PV,
+         self.STANDARD_HEATING_SYSTEM,
+         self.STANDARD_UPRN,
+         self.STANDARD_PROPERTY_TYPE,
+         self.STANDARD_YEAR_BUILT,
+         self.STANDARD_WALL_CONSTRUCTION,
+     )
+     present_columns = self.standardised_asset_list.columns
+     for column in [c for c in standard_columns if c not in present_columns]:
+         self.standardised_asset_list[column] = None
+
+ def merge_data(self, df: pd.DataFrame):
+     """
+     Left-join extra data onto the standardised asset list, using the domna property id as the key.
+
+     :param df: Dataframe to merge in; it must hold one row per domna property id.
+     :return: None. The merged result replaces the standardised asset list.
+     :raises ValueError: If df lacks the domna property id column or contains duplicated IDs.
+     """
+     id_column = self.DOMNA_PROPERTY_ID
+
+     if id_column not in df.columns:
+         raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}")
+
+     # Duplicated keys would multiply rows in the asset list
+     if df[id_column].duplicated().any():
+         raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs")
+
+     merged = self.standardised_asset_list.merge(df, how="left", on=id_column)
+     self.standardised_asset_list = merged
+
+ def extract_attributes(self):
+     """
+     Derive the attribute columns that we use to identify viable work.
+
+     Adds the solar flag, the estimated number of floors, perimeter and heat loss area, the EPC roof
+     insulation thickness, the SAP threshold flag and the EPC age flag, then processes the age band.
+     :return: None. The columns are written to the standardised asset list.
+     """
+     epc_names = self.EPC_API_DATA_NAMES
+
+     self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR] = (
+         self.standardised_asset_list[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] |
+         ~self.standardised_asset_list[epc_names["photo-supply"]].isin(["0.0", 0, None, ""])
+     )
+
+     accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"]
+
+     def _resolve_property_type(row):
+         """Pick the property type for a row: landlord value first, then the EPC value, else None."""
+         landlord_type = row[self.STANDARD_PROPERTY_TYPE]
+         # The landlord column is filled with None when it was not provided, so it may not be a string
+         if isinstance(landlord_type, str) and landlord_type.title() in accepted_epc_property_types:
+             return landlord_type.title()
+         epc_type = row[epc_names["property-type"]]
+         return None if pd.isnull(epc_type) else epc_type
+
+     # The logic here is:
+     # 1) Take the property type provided by the HA themselves
+     # 2) In absence of that, take the EPC property type
+     # 3) Otherwise use None
+     self.standardised_asset_list[self.ATTRIBUTE_NUMBER_OF_FLOORS] = self.standardised_asset_list.apply(
+         lambda x: estimate_number_of_floors(property_type=_resolve_property_type(x)),
+         axis=1
+     )
+
+     self.standardised_asset_list[epc_names["total-floor-area"]] = (
+         self.standardised_asset_list[epc_names["total-floor-area"]].astype(float)
+     )
+     # Replace "" values with NaN. NaN is used rather than None because, in pandas 1.x/2.x,
+     # replace("", None) fills from the previous row instead of producing a missing value.
+     self.standardised_asset_list[epc_names["number-habitable-rooms"]] = (
+         self.standardised_asset_list[epc_names["number-habitable-rooms"]].replace("", np.nan)
+     )
+     self.standardised_asset_list[epc_names["number-habitable-rooms"]] = (
+         self.standardised_asset_list[epc_names["number-habitable-rooms"]].astype(float)
+     )
+
+     # Estimate the perimeter, using the floor area and room count per floor
+     self.standardised_asset_list[self.ATTRIBUTE_ESTIMATED_PERIMETER] = self.standardised_asset_list.apply(
+         lambda x: estimate_perimeter(
+             floor_area=x[epc_names["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+             num_rooms=x[epc_names["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+         ), axis=1
+     )
+
+     def _floor_height(row):
+         """Return the EPC floor height as a float, defaulting to 2.5 when it is missing or empty."""
+         raw_height = row[epc_names["floor-height"]]
+         # NaN is truthy, so it has to be checked explicitly for the default to apply
+         if pd.isnull(raw_height) or not raw_height:
+             return 2.5
+         return float(raw_height)
+
+     self.standardised_asset_list[self.ATTRIBUTE_HEAT_LOSS_AREA] = self.standardised_asset_list.apply(
+         lambda x: estimate_external_wall_area(
+             num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
+             floor_height=_floor_height(x),
+             perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER],
+             built_form=x[epc_names["built-form"]]
+         ),
+         axis=1
+     )
+
+     # Insulation thickness as parsed from the EPC roof description, where there is one
+     self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = self.standardised_asset_list.apply(
+         lambda x: RoofAttributes(description=x[epc_names["roof-description"]]).process()[
+             "insulation_thickness"] if not pd.isnull(
+             x[epc_names["roof-description"]]) else None,
+         axis=1
+     )
+
+     # We produce some additional fields
+     # 1) Is the SAP score at or below the filled cavity threshold
+     self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
+         self.standardised_asset_list[epc_names["current-energy-efficiency"]].astype(float) <=
+         self.FILLED_CAVITY_SAP_THRESHOLD
+     )
+     # 2) Flag anything where the EPC inspection year is before the year threshold
+     self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
+         pd.to_datetime(
+             self.standardised_asset_list[epc_names["inspection-date"]]
+         ).dt.year < self.EPC_YEAR_THRESHOLD
+     )
+
+     self.process_age_band()
+
+ def process_age_band(self):
+     """
+     Compare each property's EPC construction age band with the landlord's year built.
+
+     For every row of ``self.standardised_asset_list`` the EPC age band is converted into a lower
+     and an upper year bound (``None`` where the band is open-ended or unusable) and a label is
+     produced describing how ``self.STANDARD_YEAR_BUILT`` relates to the band. The results are
+     left-merged back onto ``self.standardised_asset_list`` as the columns
+     ``epc_year_lower_bound``, ``epc_year_upper_bound`` and ``does_age_band_match_epc_age_band``.
+
+     Labelling convention (as used by the ranged bands): the EPC band is "older" than the year
+     built when the year built falls after the band, and "newer" when it falls before the band.
+     A missing year built always yields "No Year Built From Landlord".
+     """
+     age_band_column = self.EPC_API_DATA_NAMES["construction-age-band"]
+
+     # Bands without an upper limit, mapped to their first year
+     open_ended_bands = {
+         "England and Wales: 2007 onwards": 2007,
+         "England and Wales: 2012 onwards": 2012,
+     }
+
+     matches = "EPC Age Band Matches Year Built"
+     band_is_older = "EPC Age Band is older than Year Built"
+     band_is_newer = "EPC Age Band is newer than Year Built"
+     band_is_different = "EPC Age Band is different from Year Built"
+
+     processed_age_band = []
+     for _, x in self.standardised_asset_list.iterrows():
+         age_band = x[age_band_column]
+         year_built = x[self.STANDARD_YEAR_BUILT]
+         year_known = not pd.isnull(year_built)
+         age_band_matches = "No Year Built From Landlord"
+
+         if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES:
+             # Nothing to compare against
+             lower_bound, upper_bound = None, None
+             age_band_matches = "No EPC Age Band"
+
+         elif age_band in open_ended_bands:
+             lower_bound, upper_bound = open_ended_bands[age_band], None
+             if year_known:
+                 # A year built before the band starts means the EPC band is the newer of the two
+                 age_band_matches = matches if year_built >= lower_bound else band_is_newer
+
+         elif age_band == "England and Wales: before 1900":
+             lower_bound, upper_bound = None, 1899
+             if year_known:
+                 # A year built of 1900 or later means the EPC band is the older of the two
+                 age_band_matches = matches if year_built < 1900 else band_is_older
+
+         elif age_band.isdigit():
+             # The band is a single year
+             lower_bound = upper_bound = int(age_band)
+             if year_known:
+                 age_band_matches = matches if year_built == lower_bound else band_is_different
+
+         else:
+             # Otherwise the band looks like "<region>: <lower>-<upper>"
+             lower_date, upper_date = age_band.split(": ")[1].split("-")
+             lower_bound, upper_bound = int(lower_date), int(upper_date)
+             if year_known:
+                 if lower_bound <= year_built <= upper_bound:
+                     age_band_matches = matches
+                 elif year_built > upper_bound:
+                     age_band_matches = band_is_older
+                 else:
+                     age_band_matches = band_is_newer
+
+         processed_age_band.append(
+             {
+                 self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
+                 "epc_year_lower_bound": lower_bound,
+                 "epc_year_upper_bound": upper_bound,
+                 "does_age_band_match_epc_age_band": age_band_matches
+             }
+         )
+
+     processed_age_band = pd.DataFrame(processed_age_band)
+
+     # No explicit key: the merge joins on whichever columns the two frames share
+     self.standardised_asset_list = self.standardised_asset_list.merge(
+         processed_age_band, how="left"
+     )
+
+ def identify_worktypes(self, cleaned):
+     """
+     Flag which work types each property looks eligible for, then summarise the flags.
+
+     Adds boolean flag columns for empty cavity, cavity extraction and solar PV (plus the
+     ``cavity_reason`` and ``solar_reason`` text columns) to ``self.standardised_asset_list``,
+     and sets ``self.work_type_figures`` (counts per work type) and
+     ``self.work_type_breakdowns`` (property-type counts for cavity fills).
+
+     :param cleaned: Indexable by "walls-description", "roof-description" and
+         "floor-description"; each value is passed to ``pd.DataFrame`` and must provide
+         "original_description" and "thermal_transmittance" columns.
+     :raises NotImplementedError: If ``self.non_intrusives_present`` is falsy.
+     """
+     if not self.non_intrusives_present:
+         raise NotImplementedError("Need to implement the case for non-intrusives")
+
+     epc_names = self.EPC_API_DATA_NAMES
+     sap_score = epc_names["current-energy-efficiency"]
+     walls_description = epc_names["walls-description"]
+     roof_description = epc_names["roof-description"]
+     floor_description = epc_names["floor-description"]
+     mainheat_description = epc_names["mainheat-description"]
+     roof_thickness = self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS
+
+     def merge_u_values(frame, description_key, u_value_column):
+         # Attach the thermal transmittance held in `cleaned` for each description, where known
+         u_values = pd.DataFrame(cleaned[description_key])
+         u_values = u_values[~pd.isnull(u_values["thermal_transmittance"])][
+             ["original_description", "thermal_transmittance"]
+         ].rename(
+             columns={
+                 "original_description": epc_names[description_key],
+                 "thermal_transmittance": u_value_column
+             }
+         )
+         return frame.merge(u_values, how="left", on=epc_names[description_key])
+
+     def u_value_at_most(u_values, limit):
+         # A missing u-value never counts as insulated
+         return u_values.apply(lambda x: x <= limit if not pd.isnull(x) else False)
+
+     # Column assignments below mutate this frame in place; the name is re-bound after each merge
+     assets = self.standardised_asset_list
+
+     ######################################################
+     # Empty cavity:
+     ######################################################
+     # 1) Flagged on the non-intrusives as a cavity wall that is empty or partially filled
+     # 2) Year built is 2002 or earlier, and the property is not a bedsit
+     # 3) We don't remove anything that has access issues yet
+     non_intrusive_empty_cavity = (
+         (~assets[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
+         (assets["non-intrusives: Construction"] == "CAVITY") &
+         assets["non-intrusives: Insulated"].isin(["EMPTY", "PARTIAL"]) &
+         (assets[self.STANDARD_YEAR_BUILT] <= 2002)
+     )
+     assets["non_intrusive_indicates_empty_cavity"] = (
+         non_intrusive_empty_cavity & (assets[sap_score] <= self.EMPTY_CAVITY_SAP_THRESHOLD)
+     )
+     # Work that looks eligible only once the SAP filter is removed; anything already flagged
+     # with the SAP filter is excluded so the two flags never overlap
+     assets["non_intrusive_indicates_empty_cavity_no_sap_filter"] = np.where(
+         assets["non_intrusive_indicates_empty_cavity"],
+         False,
+         non_intrusive_empty_cavity
+     )
+
+     assets["epc_indicates_empty_cavity"] = (
+         assets[walls_description].str.lower().isin(self.EPC_NO_WALL_INSULATION_DESCRIPTIONS) &
+         (assets["epc_year_upper_bound"] <= 1995) &
+         (~assets[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD]) &
+         (assets[sap_score] <= self.EMPTY_CAVITY_SAP_THRESHOLD)
+     )
+
+     # If the EPC is estimated, we defer to the non-intrusives
+     assets["epc_indicates_empty_cavity"] = np.where(
+         (
+             assets["epc_indicates_empty_cavity"] &
+             ~assets["non_intrusive_indicates_empty_cavity"] &
+             assets["estimated"]
+         ),
+         False,
+         assets["epc_indicates_empty_cavity"]
+     )
+
+     ######################################################
+     # Extraction
+     ######################################################
+     # TODO: confirm the logic for entries that need a CIGA check
+     non_intrusive_extraction = (
+         (assets["non-intrusives: Construction"] == "CAVITY") &
+         (assets["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
+         (~assets["non-intrusives: Material"].isin(
+             ["GREY LOOSE BEAD", "COMPACTED BEAD", "FIBRE BATT NO CAVITY", "EMPTY NARROW BELOW 30mm"]
+         ))
+     )
+     assets["non_intrusive_indicates_cavity_extraction"] = (
+         non_intrusive_extraction & assets[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
+     )
+     # Optimistic view without the SAP filter, again excluding anything already flagged
+     assets["non_intrusive_indicates_cavity_extraction_no_sap_filter"] = np.where(
+         assets["non_intrusive_indicates_cavity_extraction"],
+         False,
+         non_intrusive_extraction
+     )
+
+     ######################################################
+     # Solar
+     ######################################################
+     # Check 1: Does the property have a valid heating system?
+     assets["solar_landlord_data_indicates_correct_heating_system"] = (
+         assets[self.STANDARD_HEATING_SYSTEM].isin(
+             ["air source heat pump", "ground source heat pump", "high heat retention storage heaters"]
+         )
+     )
+
+     assets["solar_epc_data_indicates_correct_heating_system"] = (
+         (
+             assets[mainheat_description].str.lower().str.contains(
+                 "air source heat pump|ground source heat pump"
+             )
+         ) | (
+             assets[mainheat_description].str.lower().str.contains("electric storage heaters") & (
+                 assets[epc_names["mainheatcont-description"]] ==
+                 "Controls for high heat retention storage heaters"
+             )
+         )
+     )
+
+     # Check 2: Does the property have solar already?
+     assets["property_has_solar"] = (
+         (assets[self.STANDARD_EXISTING_PV] == "already has PV") |
+         (assets["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") |
+         (assets[self.ATTRIBUTE_HAS_SOLAR])
+     )
+
+     # Check 3: Does the property meet the fabric condition?
+     # Solar PV installs are subject to the minimum insulation requirements which means:
+     # 1) one of the following insulation measures must be installed as part of the same
+     #    ECO4 project:
+     #    - roof insulation (flat roof, pitched roof, room-in-roof)
+     #    - exterior facing wall insulation (cavity wall, solid wall)
+     #    - party cavity wall insulation
+     #    - floor insulation (solid and underfloor)
+     # OR
+     # 2) all measures (except any exempted measure referred to in paragraph 4.28)
+     #    listed above must already be installed
+     #
+     # With this in mind, we look for 2 classes:
+     # 1) The property is fully insulated apart from the loft (<200mm insulation)
+     # 2) The property is fully insulated
+     assets["solar_landlord_walls_insulated"] = (
+         assets[self.STANDARD_WALL_CONSTRUCTION].isin(["filled cavity", "insulated solid brick"])
+     )
+
+     assets["solar_non_intrusives_walls_insulated"] = (
+         assets["non-intrusives: Insulated"].isin(["EWI", "RETRO DRILLED", "FILLED AT BUILD"])
+     )
+
+     # TODO: We don't have information about the roof from this landlord
+
+     # Walls: an insulated description, or a u-value of 0.7 or below
+     assets = self.standardised_asset_list = merge_u_values(assets, "walls-description", "walls_u_value")
+     assets["solar_epc_walls_insulated"] = (
+         assets[walls_description].str.lower().str.contains(
+             "|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)
+         ) | u_value_at_most(assets["walls_u_value"], 0.7)
+     )
+
+     # Roof: an insulated description, 200mm or more of insulation, or a u-value of 0.7 or below
+     assets = self.standardised_asset_list = merge_u_values(assets, "roof-description", "roof_u_value")
+     # The substrings are literal text, so each is escaped before being joined into one
+     # alternation. Joining with "|" and matching with regex=False looks for the joined string
+     # itself (pipes included), which never matches when there is more than one substring.
+     roof_pattern = "|".join(re.escape(text) for text in self.EPC_INSULATED_ROOF_SUBSTRINGS)
+     assets["solar_epc_roof_insulated"] = (
+         assets[roof_description].str.lower().str.contains(roof_pattern) | (
+             assets[roof_thickness].apply(lambda x: int(x) >= 200 if str(x).isdigit() else False)
+         ) | u_value_at_most(assets["roof_u_value"], 0.7)
+     )
+
+     assets["solar_epc_loft_needs_topup"] = assets[roof_thickness].apply(
+         lambda x: int(x) < 200 if str(x).isdigit() else False
+     )
+
+     # TODO: Fill with False - should be temp!
+     # Cast to bool so that the `~` below is a logical negation even if the filled column is
+     # still object dtype (`~` on Python bools held as objects gives -1/-2, not False/True)
+     assets["epc_has_floor_recommendation"] = (
+         assets["epc_has_floor_recommendation"].fillna(False).astype(bool)
+     )
+
+     # Floors: we assume that a u-value of 0.5 or below is indicative of an insulated floor
+     assets = self.standardised_asset_list = merge_u_values(assets, "floor-description", "floor_u_value")
+
+     floor_text = assets[floor_description].str.lower()
+     floor_is_solid = floor_text.str.contains("solid")
+     floor_is_suspended = floor_text.str.contains("suspended")
+     floor_described_as_insulated = floor_text.str.contains(", insulated")
+     no_floor_recommendation = ~assets["epc_has_floor_recommendation"]
+     # We do not utilise estimated EPCs for the "no recommendation" route because we will always
+     # find that "epc_has_floor_recommendation" is False for them
+     epc_not_estimated = assets["estimated"] == False
+
+     assets["solar_epc_floor_is_solid_no_recommendation"] = (
+         (floor_is_solid & no_floor_recommendation & epc_not_estimated) |
+         (floor_is_solid & floor_described_as_insulated)
+     )
+
+     # Other floor types: suspended with no recommendation, suspended and described as
+     # insulated, or any floor with a low enough u-value
+     assets["solar_epc_floor_is_other_insulated"] = (
+         (floor_is_suspended & no_floor_recommendation & epc_not_estimated) |
+         (floor_is_suspended & floor_described_as_insulated) |
+         u_value_at_most(assets["floor_u_value"], 0.5)
+     )
+
+     # We now put together the criteria
+     # Landlord or EPC data indicates an appropriate heating system, and there is no solar yet
+     heating_ok_and_no_solar = (
+         assets["solar_landlord_data_indicates_correct_heating_system"] |
+         assets["solar_epc_data_indicates_correct_heating_system"]
+     ) & ~assets["property_has_solar"]
+
+     walls_insulated = assets["solar_landlord_walls_insulated"] | assets["solar_epc_walls_insulated"]
+     walls_insulated_any_source = walls_insulated | assets["solar_non_intrusives_walls_insulated"]
+
+     # TODO: We'll need to revise this
+     assets["solar_eligible_solid_floor"] = (
+         heating_ok_and_no_solar &
+         walls_insulated_any_source &
+         assets["solar_epc_roof_insulated"] &
+         assets["solar_epc_floor_is_solid_no_recommendation"]
+     )
+
+     # Solid floor but needs a loft top-up
+     assets["solar_eligible_solid_floor_needs_loft"] = (
+         heating_ok_and_no_solar &
+         walls_insulated_any_source &
+         assets["solar_epc_loft_needs_topup"] &
+         assets["solar_epc_floor_is_solid_no_recommendation"]
+     )
+
+     # NOTE(review): the two "other floor" flags do not accept the non-intrusive wall evidence,
+     # unlike the solid-floor flags above - confirm whether that is intentional
+     # Other floor type, fully insulated
+     assets["solar_eligible_other_floor"] = (
+         heating_ok_and_no_solar &
+         walls_insulated &
+         assets["solar_epc_roof_insulated"] &
+         assets["solar_epc_floor_is_other_insulated"]
+     )
+
+     # Other floor type, needs loft top-up
+     assets["solar_eligible_other_floor_needs_loft"] = (
+         heating_ok_and_no_solar &
+         walls_insulated &
+         assets["solar_epc_loft_needs_topup"] &
+         assets["solar_epc_floor_is_other_insulated"]
+     )
+
+     # Drop anything we don't need
+     assets = self.standardised_asset_list = assets.drop(
+         columns=["walls_u_value", "roof_u_value", "floor_u_value"]
+     )
+
+     # Adjust flagged extraction jobs to remove anything eligible for solar (solid floors only)
+     assets["non_intrusive_indicates_cavity_extraction"] = (
+         assets["non_intrusive_indicates_cavity_extraction"] &
+         ~assets["solar_eligible_solid_floor"] &
+         ~assets["solar_eligible_solid_floor_needs_loft"]
+     )
+
+     blocks_of_flats = assets[assets[self.STANDARD_PROPERTY_TYPE] == "block of flats"]
+     non_blocks_of_flats = assets[assets[self.STANDARD_PROPERTY_TYPE] != "block of flats"]
+
+     def count_epc_only_empty_cavity(frame):
+         # EPC suggests an empty cavity that the non-intrusives did not already flag
+         return (
+             frame["epc_indicates_empty_cavity"] & ~frame["non_intrusive_indicates_empty_cavity"]
+         ).sum()
+
+     def count_extraction(frame, extraction_column):
+         # Extraction candidates that are not already counted as an empty cavity
+         return (
+             ~frame["non_intrusive_indicates_empty_cavity"] &
+             ~frame["epc_indicates_empty_cavity"] &
+             frame[extraction_column]
+         ).sum()
+
+     # Produce some aggregate figures
+     self.work_type_figures = {
+         "Empty Cavity (non-intrusives)": non_blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum(),
+         "Empty Cavity (non-intrusives, blocks of flats)": (
+             blocks_of_flats["non_intrusive_indicates_empty_cavity"].sum()
+         ),
+         "Empty Cavity (non-intrusives, no SAP filter)": (
+             non_blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
+         ),
+         "Empty Cavity (non-intrusives, no SAP filter, blocks of flats)": (
+             blocks_of_flats["non_intrusive_indicates_empty_cavity_no_sap_filter"].sum()
+         ),
+         "Empty Cavity (EPC)": count_epc_only_empty_cavity(non_blocks_of_flats),
+         "Empty Cavity (EPC, blocks of flat)": count_epc_only_empty_cavity(blocks_of_flats),
+         "Cavity Extraction": count_extraction(
+             non_blocks_of_flats, "non_intrusive_indicates_cavity_extraction"
+         ),
+         "Cavity Extraction (blocks of flats)": count_extraction(
+             blocks_of_flats, "non_intrusive_indicates_cavity_extraction"
+         ),
+         "Cavity Extraction (no SAP filter)": count_extraction(
+             assets, "non_intrusive_indicates_cavity_extraction_no_sap_filter"
+         ),
+         "Solar PV (Solid Floor)": assets["solar_eligible_solid_floor"].sum(),
+         "Solar PV (Solid Floor, Needs Loft Top-up)": assets["solar_eligible_solid_floor_needs_loft"].sum(),
+         "Solar PV (Other Floor)": assets["solar_eligible_other_floor"].sum(),
+         "Solar PV (Other Floor, Needs Loft Top-up)": assets["solar_eligible_other_floor_needs_loft"].sum()
+     }
+
+     # We produce a breakdown of the property types, for cavity fills
+     cavity_fills = assets[
+         assets["non_intrusive_indicates_empty_cavity"] | assets["epc_indicates_empty_cavity"]
+     ]
+     self.work_type_breakdowns = {
+         "empty_cavity": cavity_fills[self.STANDARD_PROPERTY_TYPE].value_counts()
+     }
+
+     # Finally, we note why each property has been flagged. Entries are applied in order, so a
+     # later entry overwrites an earlier one unless it is marked "only if still unset"
+     cavity_reasons = [
+         (
+             assets["non_intrusive_indicates_empty_cavity"],
+             "Non-Intrusive Data Showed Empty Cavity",
+             False
+         ),
+         (
+             assets["non_intrusive_indicates_empty_cavity_no_sap_filter"],
+             "Non-Intrusive Data Showed Empty Cavity but all SAP scores allowed",
+             False
+         ),
+         (
+             assets["epc_indicates_empty_cavity"] & ~assets["non_intrusive_indicates_empty_cavity"],
+             "EPC Data Showed Empty Cavity",
+             False
+         ),
+         (
+             assets["non_intrusive_indicates_cavity_extraction"],
+             "Non-Intrusive Data Showed Cavity Extraction",
+             True
+         ),
+         (
+             assets["non_intrusive_indicates_cavity_extraction_no_sap_filter"],
+             "Non-Intrusive Data Showed Cavity Extraction but all SAP scores allowed",
+             True
+         ),
+     ]
+     assets["cavity_reason"] = None
+     for flagged, reason, only_if_unset in cavity_reasons:
+         if only_if_unset:
+             flagged = flagged & pd.isnull(assets["cavity_reason"])
+         assets["cavity_reason"] = np.where(flagged, reason, assets["cavity_reason"])
+
+     # Flag solar, again in order so that later entries win
+     solar_reasons = [
+         ("solar_eligible_solid_floor", "Solid Floor, Insulated, No Solar"),
+         ("solar_eligible_solid_floor_needs_loft", "Solid Floor, Insulated, Needs Loft"),
+         ("solar_eligible_other_floor", "Other Floor, Insulated, No Solar"),
+         ("solar_eligible_other_floor_needs_loft", "Other Floor, Insulated, Needs Loft"),
+     ]
+     assets["solar_reason"] = None
+     for flag_column, reason in solar_reasons:
+         assets["solar_reason"] = np.where(assets[flag_column], reason, assets["solar_reason"])
+
+ def flat_analysis(self):
+     """
+     Summarise the SAP scores of flats, one summary row per postcode.
+
+     Properties are grouped by postcode and property type; for each group of type "flat" the
+     number of rows, the number scoring below ``self.FILLED_CAVITY_SAP_THRESHOLD`` (reported as
+     "below C75"), that number as a rounded percentage of the group, and the number scoring
+     below 69 are recorded. The resulting frame is stored on ``self.flat_data``.
+     """
+     sap_column = self.EPC_API_DATA_NAMES["current-energy-efficiency"]
+     group_keys = [self.STANDARD_POSTCODE, self.STANDARD_PROPERTY_TYPE]
+
+     summaries = []
+     for _, members in self.standardised_asset_list.groupby(group_keys):
+         # Only groups of flats are of interest
+         if "flat" not in members[self.STANDARD_PROPERTY_TYPE].values:
+             continue
+
+         scores = members[sap_column]
+         total = members[self.STANDARD_PROPERTY_TYPE].shape[0]
+         below_threshold = scores.lt(self.FILLED_CAVITY_SAP_THRESHOLD).sum()
+         below_c69 = scores.lt(69).sum()
+
+         summaries.append(
+             {
+                 "Postcode": members[self.STANDARD_POSTCODE].iloc[0],
+                 "Property Type": "Flat",
+                 "Number of Flats with EPC": total,
+                 "Number of Flats below C75": below_threshold,
+                 "Proportion of Flat EPCs below C75": round(100 * below_threshold / total),
+                 "Number of Flats Below C69": below_c69,
+             }
+         )
+
+     self.flat_data = pd.DataFrame(summaries)
diff --git a/asset_list/app.py b/asset_list/app.py
new file mode 100644
index 00000000..84999e93
--- /dev/null
+++ b/asset_list/app.py
@@ -0,0 +1,480 @@
+import os
+import time
+import json
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+from pprint import pprint
+import msgpack
+from utils.s3 import read_from_s3
+from asset_list.AssetList import AssetList
+from asset_list.mappings.property_type import PROPERTY_MAPPING
+from asset_list.mappings.walls import WALL_CONSTRUCTION_MAPPINGS
+from asset_list.mappings.heating_systems import HEATING_MAPPINGS
+from asset_list.mappings.exising_pv import EXISTING_PV_MAPPINGS
+
+from dotenv import load_dotenv
+from backend.SearchEpc import SearchEpc
+from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
+
+# Load backend/.env before reading the token below, so a value defined there is visible
+load_dotenv(dotenv_path="backend/.env")
+# Token handed to SearchEpc as auth_token; None when the variable is not set
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+
+def get_data(
+ df, manual_uprn_map, epc_api_only=False, row_id_name="row_id"
+):
+ """
+ Look up the newest EPC for every row of an asset list.
+
+ Rows whose standard property type is "block of flats" are not searched. For every other row
+ a SearchEpc lookup is made, with retries described inline. When ``epc_api_only`` is falsy
+ the EPC recommendations and the FindMyEPC data are fetched as well.
+
+ NOTE(review): the indentation of this function was not recoverable when these comments were
+ written, so statements about nesting below are hedged - confirm against the real source.
+
+ :param df: Asset list; rows are read through the AssetList.STANDARD_* column names and
+ ``row_id_name``.
+ :param manual_uprn_map: Mapping queried with ``.get(full_address)`` for a UPRN; it is tried
+ before the row's own UPRN column.
+ :param epc_api_only: When truthy, only the row id and the newest EPC are stored per row.
+ :param row_id_name: Name of the column holding the row identifier.
+ :return: Tuple ``(epc_data, errors, no_epc)``: a list of per-row dicts, the row ids whose
+ processing raised, and the row ids for which no EPC was found or searched.
+ """
+ uprn_column = AssetList.STANDARD_UPRN
+ fulladdress_column = AssetList.STANDARD_FULL_ADDRESS
+ address1_column = AssetList.STANDARD_ADDRESS_1
+ postcode_column = AssetList.STANDARD_POSTCODE
+
+ # These re-map the standard property types to forms accepted by the EPC api, so we can predict EPCs
+ property_type_map = {
+ "house": "House",
+ "flat": "Flat",
+ "maisonette": "Maisonette",
+ "bungalow": "Bungalow",
+ "block house": "House",
+ "coach house": "House",
+ "bedsit": "Flat"
+ }
+
+ epc_data = []
+ errors = []
+ no_epc = []
+ for _, home in tqdm(df.iterrows(), total=len(df)):
+ try:
+
+ # If we have a block of flats, we cannot retrieve this data
+ if home[AssetList.STANDARD_PROPERTY_TYPE] == "block of flats":
+ no_epc.append(home[row_id_name])
+ continue
+
+ postcode = home[postcode_column]
+ house_number = str(home[address1_column]).strip()
+ full_address = home[fulladdress_column].strip()
+ # Fall back to the raw address-1 text when no house number can be extracted
+ house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
+ if house_no is None:
+ house_no = house_number
+ # The manual map is consulted first, then the row's own UPRN column
+ uprn = manual_uprn_map.get(full_address, None)
+ if uprn is None and home.get(uprn_column):
+ uprn = home[uprn_column]
+
+ # Normalise missing values (e.g. NaN) to None
+ if pd.isnull(uprn):
+ uprn = None
+
+ property_type = property_type_map.get(home[AssetList.STANDARD_PROPERTY_TYPE], None)
+
+ searcher = SearchEpc(
+ address1=str(house_no),
+ postcode=postcode,
+ auth_token=EPC_AUTH_TOKEN,
+ os_api_key="",
+ property_type=None,
+ fast=True,
+ full_address=full_address,
+ max_retries=5,
+ uprn=uprn
+ )
+ # Force the skipping of estimating the EPC
+ searcher.ordnance_survey_client.property_type = None
+ searcher.ordnance_survey_client.built_form = None
+
+ searcher.find_property(skip_os=True)
+
+ # Retry without a UPRN, using a different first address line
+ if searcher.newest_epc is None and uprn is None:
+ # Try again:
+ if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
+ # Backup: use the second comma-separated part of the full address
+ add1 = full_address.split(",")
+ if len(add1) > 1:
+ add1 = add1[1].strip()
+ else:
+ # Try splitting on space
+ add1 = full_address.split(" ")[0].strip()
+
+ else:
+ add1 = str(house_number)
+ searcher = SearchEpc(
+ address1=add1,
+ postcode=postcode,
+ auth_token=EPC_AUTH_TOKEN,
+ os_api_key="",
+ property_type=None,
+ fast=True,
+ full_address=full_address,
+ max_retries=5
+ )
+
+ # Check if we have a flat or apartment
+ if (
+ "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
+ house_number.lower()
+ ):
+ searcher.ordnance_survey_client.property_type = "Flat"
+
+ searcher.find_property(skip_os=True)
+
+ # As a final resort, we estimate the EPC
+ # NOTE(review): presumably this runs for every row, not only inside the retry above - confirm
+ if property_type is not None and searcher.newest_epc is None:
+ searcher.ordnance_survey_client.property_type = property_type
+ searcher.find_property(skip_os=True)
+
+ if searcher.newest_epc is None:
+ no_epc.append(home[row_id_name])
+ continue
+
+ if epc_api_only:
+ epc = {
+ row_id_name: home[row_id_name],
+ **searcher.newest_epc.copy()
+ }
+
+ epc_data.append(epc)
+ continue
+
+ # Look for EPC recommendations
+ # NOTE(review): a bare except also catches KeyboardInterrupt/SystemExit - consider
+ # narrowing it to Exception
+ try:
+ property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
+ except:
+ property_recommendations = {"rows": []}
+
+ # Retrieve data from FindMyEPC
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except ValueError as e:
+ # Retry with the "address1" field when the full address finds nothing
+ if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except ValueError as e:
+ # NOTE(review): a ValueError with any other message appears to leave find_epc_data
+ # unassigned, so the row would end up in `errors` via a NameError - confirm
+ if "No EPC found" in str(e):
+ find_epc_data = {}
+ else:
+ find_epc_data = {}
+ except Exception as e:
+ raise Exception(f"Error retrieving FindMyEPC data: {e}")
+ # Random pause of 0.1-1s - presumably to throttle the FindMyEPC requests
+ time.sleep(np.random.uniform(0.1, 1))
+
+ epc = {
+ row_id_name: home[row_id_name],
+ **searcher.newest_epc.copy(),
+ "recommendations": property_recommendations["rows"],
+ "find_my_epc_data": find_epc_data,
+ }
+
+ epc_data.append(epc)
+ # Any failure is recorded against the row id (the exception itself is discarded) and is
+ # followed by a 5 second pause
+ except Exception as e:
+ errors.append(home[row_id_name])
+ time.sleep(5)
+
+ return epc_data, errors, no_epc
+
+
+def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
+    """
+    Add an "address1_extracted" column derived from the full address.
+
+    :param asset_list: DataFrame to extend; it is modified in place and also returned.
+    :param full_address_col: Name of the column holding the full address.
+    :param postcode_col: Name of the postcode column (only read by "house_number_extraction").
+    :param method: "first_two_words" keeps the first two space-separated words, "first_word"
+        keeps the first one, "house_number_extraction" calls SearchEpc.get_house_number per row.
+    :return: The same ``asset_list`` object.
+    :raises ValueError: If ``method`` is not one of the three supported values.
+    """
+    if method == "first_two_words":
+        words = asset_list[full_address_col].str.split(" ")
+        extracted = words.str[:2].str.join(" ")
+    elif method == "first_word":
+        extracted = asset_list[full_address_col].str.split(" ").str[0]
+    elif method == "house_number_extraction":
+        extracted = asset_list.apply(
+            lambda row: SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col]),
+            axis=1
+        )
+    else:
+        raise ValueError(f"Method {method} not recognized")
+
+    asset_list["address1_extracted"] = extracted
+    return asset_list
+
+
+def app():
+    """
+    Pull EPC data for a landlord asset list, standardise it and export the results to Excel.
+
+    The run is driven by the hard-coded configuration at the top of the function. Retrieved EPC data is
+    cached on disk in chunks under "<data_folder>/Chunks"; when every expected chunk file already exists
+    (and `force_retrieve_data` is False) the retrieval step is skipped and the cached chunks are re-used.
+
+    Originally written to pull EPC data for some properties owned by Livewest.
+
+    Data request contents:
+    Date of last EPC
+    Reason for EPC
+    SAP score on register
+    Property Type
+    Property Area
+    Property Age
+    Any Dimensions (HLP,PW,RH)
+    Property Wall Construction
+    Heating Type
+    Secondary Heating
+    Loft Insulation Depth
+
+    Additional if possible:
+    Heat loss calculations
+    EPC recommendations
+    Property UPRN
+
+    :raises ValueError: If the standardised asset list is empty.
+    """
+    # `json` does not appear in the import block at the top of this file, so it is imported here.
+    # NOTE(review): `msgpack`, `read_from_s3` and the *_MAPPINGS / PROPERTY_MAPPING constants used below are
+    # likewise not in the top-of-file import block - confirm they are in scope before running.
+    import json
+
+    # TODO:
+    # For cavity work:
+    # - Flag any entries that have a different wall type between non-intrusive data against EPC
+    # - Worth double checking entries that have a difference in wall construction
+    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
+    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
+    # - By postcode, we can try and deduce if all of the addresses are flats and then estimate if 50% of the flats
+    # are less than C75
+    # - Flag anything pre SAP2012
+    # - Flag anything over 5 years old
+    # - Look at year built vs age band
+    #
+    # For Solar:
+    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
+    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
+    # electric room heaters but it might need to be an EPC E
+    # - Fabric - check the floor, wall and roof:
+    # - Filled or empty cavity is good
+    # - Insulated solid/timber/system built is good
+    # - SCIS/CEG needs solid floors
+    # - JJC don’t care
+    # - Anything with a loft 200 or below
+    # - Anything C75 and above won’t qualify
+    # - Insulated loft = 200mm
+    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
+    # - Or the insulation required is loft/cavity (floors should be solid)
+
+    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
+    data_filename = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
+    sheet_name = "Sheet1"
+    postcode_column = 'Full Address.1'
+    fulladdress_column = "Full Address"
+    address1_column = None
+    address1_method = "first_word"
+    address_cols_to_concat = []
+    missing_postcodes_method = None
+    landlord_year_built = "Build Date"
+    landlord_os_uprn = None
+    landlord_property_type = "Property Type"
+    landlord_wall_construction = "Wallinsul"
+    landlord_heating_system = "HeatSorc"
+    landlord_existing_pv = None
+    landlord_property_id = "Property Reference"
+
+    # For Westward
+    # data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
+    # data_filename = "WESTWARD - completed list..xlsx"
+    # sheet_name = "Sheet1"
+    # postcode_column = "WFT EDIT Postcode"
+    # fulladdress_column = "Address"
+    # address1_column = None
+    # address1_method = "house_number_extraction"
+    # address_cols_to_concat = []
+    # missing_postcodes_method = None
+    # landlord_year_built = "Build date"
+    # landlord_os_uprn = "UPRN"
+    # landlord_property_type = "Location type"
+    # landlord_wall_construction = "Wall Construction (EPC)"
+    # landlord_heating_system = "Heat Source"
+    # landlord_existing_pv = "PV (Y/N)"
+    # landlord_property_id = "Place ref"
+
+    # Maps addresses to uprn in problematic cases
+    manual_uprn_map = {}
+
+    asset_list = AssetList(
+        local_filepath=os.path.join(data_folder, data_filename),
+        header=0,
+        sheet_name=sheet_name,
+        address1_colname=address1_column,
+        postcode_colname=postcode_column,
+        landlord_property_id=landlord_property_id,
+        full_address_colname=fulladdress_column,
+        full_address_cols_to_concat=address_cols_to_concat,
+        missing_postcodes_method=missing_postcodes_method,
+        address1_extraction_method=address1_method,
+        landlord_year_built=landlord_year_built,
+        landlord_uprn=landlord_os_uprn,
+        landlord_property_type=landlord_property_type,
+        landlord_wall_construction=landlord_wall_construction,
+        landlord_heating_system=landlord_heating_system,
+        landlord_existing_pv=landlord_existing_pv
+    )
+    asset_list.init_standardise()
+
+    # We produce the new maps, which can be saved for future usage
+
+    def _merged_map(base_map, landlord_column):
+        """Return a new dict: the base mapping extended with the mappings learnt for the landlord column."""
+        learnt = asset_list.variable_mappings[landlord_column] if landlord_column else {}
+        # dict.update() returns None, so the merged map is built by unpacking instead of copy().update()
+        return {**base_map, **learnt}
+
+    new_property_type_map = _merged_map(PROPERTY_MAPPING, asset_list.landlord_property_type)
+    new_wall_map = _merged_map(WALL_CONSTRUCTION_MAPPINGS, asset_list.landlord_wall_construction)
+    new_heating_map = _merged_map(HEATING_MAPPINGS, asset_list.landlord_heating_system)
+    new_existing_pv_map = _merged_map(EXISTING_PV_MAPPINGS, asset_list.landlord_existing_pv)
+
+    asset_list.apply_standardiation()
+
+    ### We retrieve the EPC data
+
+    # We chunk up this data into 5000 rows at a time
+    # Create the chunks directory
+    force_retrieve_data = False
+    skip = None  # Used to skip already completed chunks
+    chunk_size = 5000
+    chunk_filename = "Chunk {i}.csv"
+    download_folder = os.path.join(data_folder, "Chunks")
+    os.makedirs(download_folder, exist_ok=True)
+
+    chunk_indexes = list(range(0, len(asset_list.standardised_asset_list), chunk_size))
+    if not chunk_indexes:
+        # Fail early with a clear message rather than on max([]) / pd.concat([]) further down
+        raise ValueError("The standardised asset list is empty, there is nothing to process")
+
+    # A list (not a set) so that the chunks are always read back in chunk order
+    downloaded_files = [chunk_filename.format(i=i) for i in chunk_indexes]
+
+    # We check if we have files associated to these chunks already and if we do, and we do not want to force the
+    # fetching of the data, we skip
+    folder_contents = set(os.listdir(download_folder))
+    if all(x in folder_contents for x in downloaded_files):
+        skip = max(chunk_indexes)
+
+    for i in chunk_indexes:
+        if skip is not None and not force_retrieve_data and i <= skip:
+            continue
+        print(f"Processing chunk {i} to {i + chunk_size}")
+        chunk = asset_list.standardised_asset_list[i:i + chunk_size]
+        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
+            df=chunk,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            manual_uprn_map=manual_uprn_map,
+        )
+
+        # We now retry any failed properties
+        chunk_failed = chunk[chunk[asset_list.DOMNA_PROPERTY_ID].isin(errors_chunk)]
+        epc_data_failed, _, _ = get_data(
+            df=chunk_failed,
+            row_id_name=asset_list.DOMNA_PROPERTY_ID,
+            manual_uprn_map=manual_uprn_map,
+            epc_api_only=False
+        )
+
+        # Append the recovered data to the main data
+        epc_data_chunk.extend(epc_data_failed)
+
+        # Store the chunk locally as a csv
+        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(download_folder, chunk_filename.format(i=i)), index=False)
+        # Store the errors and no-data locally
+        with open(os.path.join(download_folder, f"Chunk {i} errors.json"), "w") as f:
+            json.dump(errors_chunk, f)
+
+        # NOTE: the content is JSON even though the file is named .csv; the name is kept unchanged so that
+        # previously written files still match
+        with open(os.path.join(download_folder, f"Chunk {i} nodata.csv"), "w") as f:
+            json.dump(no_epc_chunk, f)
+
+    # We read in and concatenate the created chunks
+    epc_data = []
+    for file in downloaded_files:
+        csv_data = pd.read_csv(os.path.join(download_folder, file))
+        # We need to convert the recommendations back to a list
+        # NOTE(security): eval executes whatever is in the cached CSV. It is only acceptable because these
+        # files are written by this function; consider ast.literal_eval if the cells are plain literals.
+        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
+        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
+        epc_data.append(csv_data)
+
+    # ignore_index gives every row a unique 0..n-1 label. Without it each chunk restarts at 0 and the
+    # index-based join with the normalised "find my epc" data below would mis-align rows.
+    epc_df = pd.concat(epc_data, ignore_index=True)
+    epc_df["estimated"] = epc_df["estimated"].fillna(False)
+
+    # We expand out the recommendations, keeping only the floor insulation flags that are used downstream.
+    # The columns are created explicitly so they exist even when no property has such a recommendation.
+    floor_recommendations = [
+        "Floor insulation (solid floor)", "Floor insulation", "Floor insulation (suspended floor)"
+    ]
+    transformed_data = []
+    for property_id, property_recs in zip(epc_df[asset_list.DOMNA_PROPERTY_ID], epc_df["recommendations"]):
+        present = {rec["improvement-summary-text"] for rec in property_recs}
+        row_data = {asset_list.DOMNA_PROPERTY_ID: property_id}
+        row_data.update({name: name in present for name in floor_recommendations})
+        transformed_data.append(row_data)
+
+    transformed_df = pd.DataFrame(
+        transformed_data, columns=[asset_list.DOMNA_PROPERTY_ID] + floor_recommendations
+    )
+    transformed_df["epc_has_floor_recommendation"] = transformed_df[floor_recommendations].any(axis=1)
+
+    # Get the find my epc data. json_normalize returns a fresh 0..n-1 index, which lines up with epc_df
+    # because of the ignore_index above.
+    find_my_epc_data = epc_df[[asset_list.DOMNA_PROPERTY_ID]].join(
+        pd.json_normalize(epc_df["find_my_epc_data"].tolist())
+    )
+    find_my_epc_data = find_my_epc_data.merge(
+        transformed_df[[asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]],
+        how="left", on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    # We check if we get the solar pv column:
+    if "Solar photovoltaics" not in find_my_epc_data.columns:
+        find_my_epc_data["Solar photovoltaics"] = False
+
+    # Retrieve just the data we need
+    epc_df = epc_df[
+        [asset_list.DOMNA_PROPERTY_ID] + list(asset_list.EPC_API_DATA_NAMES.keys())
+    ].rename(
+        columns=asset_list.EPC_API_DATA_NAMES
+    )
+
+    epc_df = epc_df.merge(
+        find_my_epc_data[
+            [asset_list.DOMNA_PROPERTY_ID, "epc_has_floor_recommendation"]
+            + list(asset_list.FIND_EPC_DATA_NAMES.keys())
+        ].rename(columns=asset_list.FIND_EPC_DATA_NAMES),
+        how="left",
+        on=asset_list.DOMNA_PROPERTY_ID
+    )
+
+    asset_list.merge_data(epc_df)
+
+    asset_list.extract_attributes()
+
+    cleaned = read_from_s3(
+        s3_file_name="cleaned_epc_data/cleaned.bson",
+        bucket_name="retrofit-data-dev"
+    )
+    cleaned = msgpack.unpackb(cleaned, raw=False)
+
+    # TODO: We should break out the identification of work types to flag blocks of flats specifically
+    asset_list.identify_worktypes(cleaned)
+
+    pprint(asset_list.work_type_figures)
+
+    asset_list.flat_analysis()
+
+    # Store as an excel. splitext drops only the final extension and also copes with names without one.
+    output_filename = os.path.join(data_folder, os.path.splitext(data_filename)[0]) + " - Standardised.xlsx"
+    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data
+
+    with pd.ExcelWriter(output_filename) as writer:
+        asset_list.standardised_asset_list.to_excel(writer, sheet_name="Standardised Asset List", index=False)
+        asset_list.flat_data.to_excel(writer, sheet_name="Flat Data", index=False)
diff --git a/asset_list/mappings/exising_pv.py b/asset_list/mappings/exising_pv.py
new file mode 100644
index 00000000..06e77bba
--- /dev/null
+++ b/asset_list/mappings/exising_pv.py
@@ -0,0 +1,12 @@
+# The set of standardised labels describing whether a property already has solar PV
+STANDARD_EXISTING_PV = {
+ "already has PV", "no PV", "unknown"
+}
+
+# Maps raw values (upper/lower case yes/no strings and booleans) onto the standard labels.
+# No key maps to "unknown". Lookups are case-sensitive, so e.g. "Yes" is not covered.
+EXISTING_PV_MAPPINGS = {
+ "NO": "no PV",
+ "YES": "already has PV",
+ "no": "no PV",
+ "yes": "already has PV",
+ True: "already has PV",
+ False: "no PV",
+}
diff --git a/asset_list/mappings/heating_systems.py b/asset_list/mappings/heating_systems.py
new file mode 100644
index 00000000..4879efcc
--- /dev/null
+++ b/asset_list/mappings/heating_systems.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+# The set of standardised heating system labels
+STANDARD_HEATING_SYSTEMS = {
+ "gas combi boiler",
+ "electric storage heaters",
+ "district heating",
+ "gas condensing boiler",
+ "oil boiler",
+ "gas condensing combi",
+ "air source heat pump",
+ "boiler - other fuel",
+ "ground source heat pump",
+ "electric radiators",
+ "other",
+ "electric boiler",
+ "unknown",
+ "communal gas boiler",
+ "high heat retention storage heaters",
+}
+
+# Maps raw heating descriptions onto the standard labels. The table holds three groups of keys:
+# descriptions in their original casing, the same descriptions lower-cased, and short upper-case codes.
+# No key currently maps to "high heat retention storage heaters".
+HEATING_MAPPINGS = {
+ "Combi - GAS": "gas combi boiler",
+ "E7 Storage Heaters": "electric storage heaters",
+ "District heating system": "district heating",
+ "Condensing Boiler - GAS": "gas condensing boiler",
+ "Boiler Oil/other": "oil boiler",
+ "Condensing Combi - Gas": "gas condensing combi",
+ "Air Source Source Heat Pump": "air source heat pump",
+ "Biomass Boiler": "boiler - other fuel",
+ "Ground Source Heat Pump": "ground source heat pump",
+ "Electric Oil filled radiators": "electric radiators",
+ "Solid Fuel": "other",
+ "LPG Boiler": "boiler - other fuel",
+ "Electric Boiler": "electric boiler",
+ "No data": "unknown",
+ "Boiler Communal/Commercial - GAS": "communal gas boiler",
+ "Eco Electric Radiators": "electric radiators",
+ "Gas fire": "other",
+ "Backboiler - Solid fuel": "other",
+ # Lower-cased copies of the descriptions above
+ 'combi - gas': 'gas combi boiler',
+ 'e7 storage heaters': 'electric storage heaters',
+ 'district heating system': 'district heating',
+ 'condensing boiler - gas': 'gas condensing boiler',
+ 'boiler oil/other': 'oil boiler',
+ 'condensing combi - gas': 'gas condensing combi',
+ 'air source source heat pump': 'air source heat pump',
+ 'biomass boiler': 'boiler - other fuel',
+ 'ground source heat pump': 'ground source heat pump',
+ 'electric oil filled radiators': 'electric radiators',
+ 'solid fuel': 'other',
+ 'lpg boiler': 'boiler - other fuel',
+ 'electric boiler': 'electric boiler',
+ 'no data': 'unknown', 'boiler communal/commercial - gas': 'communal gas boiler',
+ 'eco electric radiators': 'electric radiators',
+ 'gas fire': 'other', 'backboiler - solid fuel': 'other',
+ # Short codes
+ # NOTE(review): "SOLIDFUEL" maps to "boiler - other fuel" while "Solid Fuel" above maps to "other" - confirm
+ # that the difference is intended
+ 'ASHP': 'air source heat pump',
+ 'COMMHEAT': 'communal gas boiler',
+ 'GBB': 'gas combi boiler',
+ 'GFS': 'gas condensing boiler',
+ 'GWA': 'gas condensing boiler',
+ 'GWM': 'gas condensing combi',
+ 'HDU': 'district heating',
+ 'OILBLR': 'oil boiler',
+ 'SOLIDFUEL': 'boiler - other fuel',
+ 'STORHTR': 'electric storage heaters',
+ # NOTE(review): NaN never compares equal to itself, so a plain dict lookup only finds this entry when the
+ # key is the very same np.nan object - confirm how missing values are looked up by the caller
+ np.nan: 'unknown',
+}
diff --git a/asset_list/mappings/property_type.py b/asset_list/mappings/property_type.py
new file mode 100644
index 00000000..2612f058
--- /dev/null
+++ b/asset_list/mappings/property_type.py
@@ -0,0 +1,25 @@
+# The set of standardised property type labels
+STANDARD_PROPERTY_TYPES = {
+ "house", "flat", "maisonette", "bungalow", "park home", "block house", "bedsit", "coach house",
+ "unknown", "other", "block of flats"
+}
+
+# A basic mapping from commonly seen raw values to the standard labels.
+# Keys are matched exactly: only the casings listed here are covered (e.g. "HOUSE" and "House", but not "house").
+PROPERTY_MAPPING = {
+ "HOUSE": "house",
+ "FLAT": "flat",
+ "MAISONET": "maisonette",
+ "BUNGALOW": "bungalow",
+ "BLKHOUS": "block house",
+ "blkhous": "block house",
+ "BEDSIT": "bedsit",
+ "COACHSE": "coach house",
+ "coachse": "coach house",
+ 'Admin Unit Type': 'unknown',
+ 'Block': 'block of flats',
+ 'Bungalow': 'bungalow',
+ 'Flat': 'flat',
+ 'House': 'house',
+ 'Maisonette': 'maisonette',
+ 'Stairwell': 'other'
+}
diff --git a/asset_list/mappings/walls.py b/asset_list/mappings/walls.py
new file mode 100644
index 00000000..78d64988
--- /dev/null
+++ b/asset_list/mappings/walls.py
@@ -0,0 +1,92 @@
+# The set of standardised wall construction labels
+STANDARD_WALL_CONSTRUCTIONS = {
+ "uninsulated cavity", "filled cavity", "partial insulated cavity", "cavity unknown insulation",
+ "uninsulated solid brick", "insulated solid brick", "solid brick unknown insulation",
+ "timber frame",
+ "system built", "granite or whinstone", "other", "unknown", "sandstone or limestone",
+ "cob",
+ "new build - average thermal transmittance",
+}
+
+# Maps raw wall descriptions onto the standard labels. Keys are matched exactly, which is why many
+# descriptions are listed both in their original casing and lower-cased.
+# The "Average thermal transmittance ..." keys contain "W/m?K" and "W/m-¦K" next to "W/m²K".
+# NOTE(review): these look like mis-encoded forms of "W/m²K" taken verbatim from the input data - confirm,
+# and do not "fix" the keys, otherwise the raw values will no longer match.
+WALL_CONSTRUCTION_MAPPINGS = {
+ "New Build - Average Thermal Transmittance": "new build - average thermal transmittance",
+ 'Average thermal transmittance 0.25 W/m?K': 'unknown',
+ 'Cavity wall, as built, insulated (assumed)': 'filled cavity',
+ 'Average thermal transmittance 0.31 W/m?K': 'unknown',
+ 'Cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+ 'Average thermal transmittance 0.30 W/m?K': 'unknown', 'Average thermal transmittance 0.28 W/m-¦K': 'unknown',
+ 'Average thermal transmittance 0.25 W/m-¦K': 'unknown', 'Average thermal transmittance 0.21 W/m-¦K': 'unknown',
+ 'Average thermal transmittance 0.20 W/m-¦K': 'unknown', 'Average thermal transmittance 0.29 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.16 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.27 W/m²K': 'unknown',
+ 'Average thermal transmittance 0.15 W/m-¦K': 'unknown', 'Average thermal transmittance 0.23 W/m-¦K': 'unknown',
+ 'Average thermal transmittance 0.18 W/m?K': 'unknown',
+ 'Granite or whin, with internal insulation': 'granite or whinstone',
+ "Granite or whinstone, as built, insulated (assumed)": "granite or whinstone",
+ 'Average thermal transmittance 0.22 W/m-¦K': 'unknown', 'Average thermal transmittance 0.24 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.16 W/m-¦K': 'unknown', 'Average thermal transmittance 0.35 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.26 W/m-¦K': 'unknown', 'Average thermal transmittance 0.62 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.64 W/m?K': 'unknown', 'Average thermal transmittance 0.61 W/m?K': 'unknown',
+ 'Sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+ 'Average thermal transmittance 0.33 W/m?K': 'unknown',
+ 'Cavity wall,': "cavity unknown insulation",
+ 'Cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+ 'Average thermal transmittance 0.29 W/m-¦K': 'unknown', 'Average thermal transmittance 0.32 W/m-¦K': 'unknown',
+ 'Average thermal transmittance 0.19 W/m-¦K': 'unknown', 'Average thermal transmittance 0.27 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.22 W/m?K': 'unknown', 'Average thermal transmittance 0.38 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.26 W/m?K': 'unknown', 'Average thermal transmittance 0.27 W/m-¦K': 'unknown',
+ 'Average thermal transmittance 0.18 W/m-¦K': 'unknown', 'Average thermal transmittance = 0.27 W/m?K': 'unknown',
+ 'Cavity wall, with external insulation': 'filled cavity', 'Average thermal transmittance 0.21 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.23 W/m?K': 'unknown', 'Average thermal transmittance 0.20 W/m?K': 'unknown',
+ 'Average thermal transmittance 0.32 W/m?K': 'unknown', 'Average thermal transmittance 0.24 W/m-¦K': 'unknown',
+ 'Cavity wall, with internal insulation': 'filled cavity',
+ 'Average thermal transmittance 0.17 W/m-¦K': 'unknown', 'Average thermal transmittance 0.28 W/m?K': 'unknown',
+ # Lower-cased copies of the descriptions above
+ 'new build - average thermal transmittance': 'new build - average thermal transmittance',
+ 'average thermal transmittance 0.25 w/m?k': 'unknown',
+ 'cavity wall, as built, insulated (assumed)': 'filled cavity',
+ 'average thermal transmittance 0.31 w/m?k': 'unknown',
+ 'cavity wall, as built, no insulation (assumed)': 'uninsulated cavity',
+ 'average thermal transmittance 0.30 w/m?k': 'unknown', 'average thermal transmittance 0.28 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.25 w/m-¦k': 'unknown', 'average thermal transmittance 0.21 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.20 w/m-¦k': 'unknown', 'average thermal transmittance 0.29 w/m?k': 'unknown',
+ 'average thermal transmittance 0.16 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m²k': 'unknown',
+ 'average thermal transmittance 0.15 w/m-¦k': 'unknown', 'average thermal transmittance 0.23 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.18 w/m?k': 'unknown',
+ 'granite or whin, with internal insulation': 'granite or whinstone',
+ 'average thermal transmittance 0.22 w/m-¦k': 'unknown', 'average thermal transmittance 0.24 w/m?k': 'unknown',
+ 'average thermal transmittance 0.16 w/m-¦k': 'unknown', 'average thermal transmittance 0.35 w/m?k': 'unknown',
+ 'average thermal transmittance 0.26 w/m-¦k': 'unknown', 'average thermal transmittance 0.62 w/m?k': 'unknown',
+ 'average thermal transmittance 0.64 w/m?k': 'unknown', 'average thermal transmittance 0.61 w/m?k': 'unknown',
+ 'sandstone or limestone, as built, no insulation (assumed)': 'sandstone or limestone',
+ 'average thermal transmittance 0.33 w/m?k': 'unknown', 'cavity wall,': "cavity unknown insulation",
+ 'cavity wall, as built, partial insulation (assumed)': 'partial insulated cavity',
+ 'average thermal transmittance 0.29 w/m-¦k': 'unknown', 'average thermal transmittance 0.32 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.19 w/m-¦k': 'unknown', 'average thermal transmittance 0.27 w/m?k': 'unknown',
+ 'average thermal transmittance 0.22 w/m?k': 'unknown', 'average thermal transmittance 0.38 w/m?k': 'unknown',
+ 'average thermal transmittance 0.26 w/m?k': 'unknown', 'average thermal transmittance 0.27 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.18 w/m-¦k': 'unknown', 'average thermal transmittance = 0.27 w/m?k': 'unknown',
+ 'cavity wall, with external insulation': 'filled cavity', 'average thermal transmittance 0.21 w/m?k': 'unknown',
+ 'average thermal transmittance 0.23 w/m?k': 'unknown', 'average thermal transmittance 0.20 w/m?k': 'unknown',
+ 'average thermal transmittance 0.32 w/m?k': 'unknown', 'average thermal transmittance 0.24 w/m-¦k': 'unknown',
+ 'cavity wall, with internal insulation': 'filled cavity', 'average thermal transmittance 0.17 w/m-¦k': 'unknown',
+ 'average thermal transmittance 0.28 w/m?k': 'unknown',
+ # Further descriptions (original casing only)
+ 'Cavity wall, filled cavity': 'filled cavity',
+ 'Cavity wall, filled cavity and external insulation': 'filled cavity',
+ 'Granite or whinstone, as built, no insulation (assumed)': 'granite or whinstone',
+ 'Solid brick, as built, insulated (assumed)': 'insulated solid brick',
+ 'Solid brick, as built, no insulation (assumed)': 'uninsulated solid brick',
+ 'Solid brick, with external insulation': 'insulated solid brick',
+ 'Solid brick, with internal insulation': 'insulated solid brick',
+ 'System built, as built, insulated (assumed)': 'system built',
+ 'System built, as built, no insulation (assumed)': 'system built',
+ 'System built, with external insulation': 'system built',
+ 'System built, with internal insulation': 'system built',
+ 'Timber frame, as built, insulated (assumed)': 'timber frame',
+ 'Timber frame, as built, no insulation (assumed)': 'timber frame',
+ 'Timber frame, as built, partial insulation (assumed)': 'timber frame',
+ 'Timber frame, with additional insulation': 'timber frame',
+ # Short upper-case codes
+ 'CAVITY': 'cavity unknown insulation',
+ 'COMB': 'unknown',
+ 'NONE': 'unknown',
+ 'NOTKNOWN': 'unknown',
+ 'SOLID': 'solid brick unknown insulation',
+}
diff --git a/asset_list/requirements.txt b/asset_list/requirements.txt
new file mode 100644
index 00000000..fd43ac64
--- /dev/null
+++ b/asset_list/requirements.txt
@@ -0,0 +1,12 @@
+postal
+pandas
+usaddress
+pydantic-settings==2.6.0
+epc-api-python==1.0.2
+fuzzywuzzy
+boto3
+openpyxl
+openai
+tiktoken
+msgpack
+beautifulsoup4
\ No newline at end of file
diff --git a/asset_list/tests/test_standardisation.py b/asset_list/tests/test_standardisation.py
new file mode 100644
index 00000000..b6d9a391
--- /dev/null
+++ b/asset_list/tests/test_standardisation.py
@@ -0,0 +1,5 @@
+from asset_list.AssetList import AssetList
+
+
+def test_multi_unit_address_flagging():
+    """Check that `_identify_multi_address` returns a truthy value for a "Block (Rooms 1-4)" address."""
+    address = 'Block (Rooms 1-4), 23 Clifton Hill, Newtown, Exeter, EX1 2DL'
+    is_flagged = AssetList._identify_multi_address(address)
+    assert is_flagged
diff --git a/backend/Funding.py b/backend/Funding.py
index f0780c51..2839c7ff 100644
--- a/backend/Funding.py
+++ b/backend/Funding.py
@@ -149,7 +149,8 @@ class Funding:
:return:
"""
measure_table = pd.DataFrame([
- m for m in self.recommendations if m in measures and m["default"]
+ # Parenthesised on purpose: `and` binds tighter than `or`, so without the outer brackets the
+ # m["default"] filter would only apply to the measure_type check and not to the type check
+ m for m in self.recommendations if
+ ((m["type"] in measures) or (m["measure_type"] in measures)) and m["default"]
])
measure_table["post_install_sap"] = measure_table["sap_points"] + self.starting_sap
@@ -180,13 +181,10 @@ class Funding:
measure_table["cost_minus_funding"] = measure_table["total"] - measure_table["estimated_funding"]
measure_table["cost_minus_funding_per_sap"] = measure_table["cost_minus_funding"] / measure_table["sap_points"]
measure_table = measure_table.sort_values(["cost_minus_funding_per_sap", "total"], ascending=[True, False])
- # Recommend the measure, with estimated funding amount
- recommended_measure = measure_table.head(1)
- return {
- "measure_type": recommended_measure["measure_type"],
- "estimated_funding": recommended_measure["estimated_funding"]
- }
+ return measure_table[
+ ["type", "measure_type", "Cost Savings", "estimated_funding"]
+ ].rename(columns={"Cost Savings": "project_score"}).to_dict("records")
def sap_to_eco_band(self, sap_points):
"""
diff --git a/backend/Property.py b/backend/Property.py
index a495431f..eaffd54d 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -395,6 +395,7 @@ class Property:
primary_recommendation_id=rec["recommendation_id"],
non_invasive_recommendations=self.non_invasive_recommendations,
)
+
self.recommendations_scoring_data.append(scoring_dict)
simulation_epc = self.epc_record.prepared_epc.copy()
@@ -1258,6 +1259,12 @@ class Property:
if (self.building_id is not None) and (self.solar_panel_configuration is not None):
return True
+ # If the property is in a conservation area, is listed or is a heritage building, solar panels
+ # are generally difficult to get through planning restrictions, so we do not recommend
+ # solar panels
+ if self.restricted_measures:
+ return False
+
is_valid_property_type = self.data["property-type"] in ["House", "Bungalow", "Maisonette"]
is_valid_roof_type = (
self.roof["is_flat"] or self.roof["is_pitched"] or self.roof["is_roof_room"]
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index c74a0b1f..0d921bec 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -208,9 +208,14 @@ class SearchEpc:
try:
# Updated regex to catch house numbers including alphanumeric ones
pattern = r'(?i)(?:flat|apartment)\s*(\d+\w*)|^\s*(\d+\w*)'
- match = re.search(pattern, address)
- if match:
- return next(g for g in match.groups() if g is not None)
+ match1 = re.search(pattern, address)
+ if match1:
+ return next(g for g in match1.groups() if g is not None)
+
+ pattern2 = r'(?i)(flat|apartment)\s*([a-zA-Z]?\d+[a-zA-Z]?)'
+ match2 = re.search(pattern2, address)
+ if match2:
+ return match2.group(2)
parsed = usaddress.parse(address)
# First, try to get the 'OccupancyIdentifier' if 'OccupancyType' is detected
@@ -221,7 +226,8 @@ class SearchEpc:
continue
if part == postcode.split(" ")[1]:
continue
- return part # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
+ return part.rstrip(
+ ",") # This assumes the first 'OccupancyIdentifier' after 'OccupancyType' is the primary
# number
# Fallback to 'AddressNumber' if no 'OccupancyIdentifier' is found
@@ -331,6 +337,9 @@ class SearchEpc:
if row["lmk-key"] not in seen and not seen.add(row["lmk-key"])
]
+ if data["rows"]:
+ api_response["msg"] = self.SUCCESS
+
return api_response["msg"]
def filter_rows(self, rows, property_type=None, address=None):
diff --git a/backend/app/assumptions.py b/backend/app/assumptions.py
index 841ec2c1..8d0c05be 100644
--- a/backend/app/assumptions.py
+++ b/backend/app/assumptions.py
@@ -54,4 +54,5 @@ DESCRIPTIONS_TO_FUEL_TYPES = {
"Gas instantaneous at point of use": {"fuel": "Natural Gas", "cop": 0.85},
"Room heaters, wood logs": {"fuel": "Wood Logs", "cop": 1},
"Boiler and radiators, coal": {"fuel": "Coal", "cop": 0.85},
+ "From main system, no cylinderstat": {"fuel": "Natural Gas", "cop": 0.85},
}
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 04a2ef7f..d82e774b 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -338,7 +338,7 @@ def extract_property_request_data(
# Because we have some non-invasive recommendations that match on address and postcode, but not UPRN
# we need to check existence of uprn
- has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else True
+ has_uprn = "uprn" in non_invasive_recommendations[0] if non_invasive_recommendations else False
if has_uprn:
has_uprn = non_invasive_recommendations[0]["uprn"] not in ["", None]
@@ -370,7 +370,7 @@ def extract_property_request_data(
property_non_invasive_recommendations["recommendations"] = str(transformed)
# Check if the valuation data has uprn
- valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else True
+ valuation_has_uprn = "uprn" in valuation_data[0] if valuation_data else False
if valuation_has_uprn:
valuation_has_uprn = valuation_data[0]["uprn"] not in ["", None]
@@ -639,8 +639,10 @@ async def trigger_plan(body: PlanTriggerRequest):
recommendations_scoring_data = pd.DataFrame(recommendations_scoring_data)
recommendations_scoring_data = recommendations_scoring_data.drop(
- columns=["rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
- "carbon_ending"]
+ columns=[
+ "rdsap_change", "heat_demand_change", "carbon_change", "sap_ending", "heat_demand_ending",
+ "carbon_ending"
+ ]
)
all_predictions = await model_api.async_paginated_predictions(
@@ -692,7 +694,8 @@ async def trigger_plan(body: PlanTriggerRequest):
Recommendations.calculate_recommendation_tenant_savings(
property_instance=property_instance,
kwh_simulation_predictions=kwh_simulation_predictions,
- property_recommendations=property_recommendations
+ property_recommendations=property_recommendations,
+ ashp_cop=body.ashp_cop
)
)
property_instance.current_energy_bill = property_current_energy_bill
@@ -822,7 +825,7 @@ async def trigger_plan(body: PlanTriggerRequest):
property_recommendations=recommendations[p.id],
project_scores_matrix=eco_project_scores_matrix,
whlg_eligible_postcodes=whlg_eligible_postcodes,
- gbis_abs_rate=20,
+ gbis_abs_rate=15,
eco4_abs_rate=15,
)
funding_calulator.check_eligibiltiy()
diff --git a/backend/app/plan/schemas.py b/backend/app/plan/schemas.py
index f84912fe..618bec90 100644
--- a/backend/app/plan/schemas.py
+++ b/backend/app/plan/schemas.py
@@ -80,3 +80,5 @@ class PlanTriggerRequest(BaseModel):
multi_plan: Optional[bool] = False
optimise: Optional[bool] = True
default_u_values: Optional[bool] = True
+
+ ashp_cop: Optional[float] = 2.8
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index 720005d3..6d4852b2 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -1,5 +1,4 @@
import numpy as np
-from scipy.constants import value
class PropertyValuation:
@@ -216,6 +215,30 @@ class PropertyValuation:
cls.UPRN_VALUE_LOOKUP.get(property_instance.uprn)
)
+ current_epc = property_instance.data["current-energy-rating"]
+
+ if not current_value:
+ return {
+ "current_value": 0,
+ "lower_bound_increased_value": 0,
+ "upper_bound_increased_value": 0,
+ "average_increased_value": 0,
+ "average_increase": 0
+ }
+
+ return cls.estimate_valuation_improvement(current_value, current_epc, target_epc, total_cost)
+
+ @classmethod
+ def estimate_valuation_improvement(cls, current_value, current_epc, target_epc, total_cost=None):
+ """
+ This function estimates the value of a property based on the current EPC rating and the target EPC rating
+ :param current_value:
+ :param current_epc:
+ :param target_epc:
+ :param total_cost:
+ :return:
+ """
+
if not current_value:
return {
"current_value": 0,
@@ -225,7 +248,6 @@ class PropertyValuation:
"average_increase": 0
}
- current_epc = property_instance.data["current-energy-rating"]
# We get the spectrum of ratings between the current and target EPC
epc_band_range = cls.EPC_BANDS[cls.EPC_BANDS.index(current_epc): cls.EPC_BANDS.index(target_epc) + 1]
diff --git a/backend/tests/test_search_epc.py b/backend/tests/test_search_epc.py
index 3b2e2a5b..562585ad 100644
--- a/backend/tests/test_search_epc.py
+++ b/backend/tests/test_search_epc.py
@@ -48,3 +48,12 @@ class TestSearchEpcIntegration:
assert epc_searcher.newest_epc["lmk-key"] == lmk_key
assert epc_searcher.newest_epc["uprn"] == uprn
assert len(epc_searcher.older_epcs) == n_old_epcs
+
+ def test_search_housenumber(self):
+     """Check that alphanumeric flat identifiers are returned as the house number."""
+     # address -> expected house number (no postcode is passed in either case)
+     expected_by_address = {
+         'Flat A11, Mortimer House, Grendon Road, Exeter': "A11",
+         'Flat A9, Mortimer House, Grendon Road, Exeter, EX1 2NL': "A9",
+     }
+     for address, expected in expected_by_address.items():
+         assert SearchEpc.get_house_number(address, None) == expected
diff --git a/etl/customers/l_and_g/ic_slides.py b/etl/customers/l_and_g/ic_slides.py
index 72dfc2c0..a5cb3511 100644
--- a/etl/customers/l_and_g/ic_slides.py
+++ b/etl/customers/l_and_g/ic_slides.py
@@ -132,7 +132,7 @@ def get_data(portfolio_id, scenario_ids):
return properties_data, plans_data, recommendations_data
-properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[199])
+properties_data, plans_data, recommendations_data = get_data(portfolio_id=124, scenario_ids=[205])
properties_df = pd.DataFrame(properties_data)
plans_df = pd.DataFrame(plans_data)
@@ -240,4 +240,7 @@ df["Predicted Post Works SAP"] = df["Current SAP Points"] + df["sap_points"]
df["Predicted Post Works SAP"] = df["Predicted Post Works SAP"].round()
df["Predicted Post Works EPC"] = df["Predicted Post Works SAP"].apply(lambda x: sap_to_epc(x))
+df["Recommendation: Air Source Heat Pump"].sum()
+df["Cost: Air Source Heat Pump"].sum()
+
df.to_csv("/Users/khalimconn-kowlessar/Documents/hestia/Customers/L&G/Basildon Data Export - 2.csv", index=False)
diff --git a/etl/customers/lambeth/re-knocks.py b/etl/customers/lambeth/re-knocks.py
new file mode 100644
index 00000000..1de91b50
--- /dev/null
+++ b/etl/customers/lambeth/re-knocks.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+# Ad-hoc exploration of the Lambeth re-knocks workbook.
+WORKBOOK_PATH = "/Users/khalimconn-kowlessar/Downloads/Lambeth Reknocks.xlsx"
+# Free-text notes column; its header is the full question text used in the sheet.
+NOTES_COLUMN = (
+    "Notes (If 'no answer' under outcomes, have you checked around the property for access issues where "
+    "possible?)"
+)
+# Exact note text (typos included) of the single case pulled out below.
+ACCESS_ISSUE_NOTE = (
+    'Access to rear of property only through number 10. Overgrown athe rear of property '
+    'installer wont be able to access'
+)
+
+data = pd.read_excel(WORKBOOK_PATH, sheet_name="Possible Route", header=1)
+
+# Frequency of each outcome; the result is discarded unless this is run interactively.
+data["Outcomes"].value_counts()
+
+# Strip out: No
+
+# Keep only the rows whose outcome is "See notes", then count each distinct note.
+see_notes = data["Outcomes"] == "See notes"
+df = data[see_notes]
+notes_df = df[NOTES_COLUMN].value_counts().to_frame()
+
+# Rows carrying one specific access-issue note.
+matches_example = df[NOTES_COLUMN] == ACCESS_ISSUE_NOTE
+example = df[matches_example]
+
+# 18 did not attend
+#
diff --git a/etl/customers/panacap/assets.py b/etl/customers/panacap/assets.py
new file mode 100644
index 00000000..ec57d9a4
--- /dev/null
+++ b/etl/customers/panacap/assets.py
@@ -0,0 +1,61 @@
+import os
+
+import pandas as pd
+from dotenv import load_dotenv
+
+from etl.spatial.OpenUprnClient import OpenUprnClient
+from etl.route_march_data_pull.app import get_data
+
+# Pulls EPC and spatial data for a fixed list of Panacap addresses and writes the combined table to CSV.
+load_dotenv(dotenv_path="backend/.env")
+# Read here but not referenced again in this script.
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+SPATIAL_BUCKET_NAME = "retrofit-data-dev"
+OUTPUT_CSV_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Panacap/Acquisitions EPC Data.csv"
+
+# (address, postcode) pairs for the properties to look up.
+_ADDRESS_POSTCODE_PAIRS = [
+    ("3 Willis Road", "CB1 2AQ"),
+    ("22 Catharine Street", "CB1 3AW"),
+    ("332 Mill Road", "CB1 3NN"),
+    ("330 Mill Road", "CB1 3NN"),
+    ("328 Mill Road", "CB1 3NN"),
+    ("71 Mill Road", "CB1 2AS"),
+    ("78 Argyle Street", "CB1 3LZ"),
+    ("9 Graham Road", "CB4 2ZE"),
+    ("217 Mill Road", "CB1 3BE"),
+    ("374 Mill Road", "CB1 3NN"),
+    ("174 Thoday Street", "CB1 3AX"),
+    ("37 Abbey Road", "CB5 8HH"),
+    ("18 Upper Gwydir Street", "CB1 2LR"),
+    ("21 Fulbourn Road Fulbourn", "CB1 9JL"),
+    ("108 Argyle Street", "CB1 3LS"),
+    ("115 Victoria Road", "CB4 3BS"),
+    ("55 Ross Street", "CB1 3BP"),
+    ("16 Kingston Street", "CB1 2NU"),
+    ("13 Thoday Street", "CB1 3AS"),
+    ("103 York Street", "CB1 2PZ"),
+]
+addresses = [
+    {"address": street_address, "postcode": postcode}
+    for street_address, postcode in _ADDRESS_POSTCODE_PAIRS
+]
+
+asset_list = pd.DataFrame(addresses)
+# row_id is the key used further down to join the EPC results back onto these rows.
+asset_list["row_id"] = asset_list.index
+
+# The same column is passed as both the full address and address line 1.
+epc_data, _, _ = get_data(
+    asset_list=asset_list,
+    fulladdress_column="address",
+    postcode_column="postcode",
+    address1_column="address",
+    manual_uprn_map={},
+    epc_api_only=True,
+)
+
+epc_df = pd.DataFrame(epc_data)
+# Interactive size check; the value is discarded when run as a script.
+epc_df.shape
+
+# NOTE(review): the _x suffixes presume epc_df also has address/postcode columns, so pandas
+# suffixes the originals on merge - confirm against get_data's output.
+asset_list = (
+    asset_list
+    .merge(epc_df, how="left", on="row_id")
+    .rename(columns={"address_x": "Address", "postcode_x": "Postcode"})
+)
+
+# Both join keys are cast to str so the UPRN merge compares like with like.
+asset_list["uprn"] = asset_list["uprn"].astype(str)
+
+epc_uprns = [record["uprn"] for record in epc_data]
+spatial_data = OpenUprnClient.get_spatial_data(epc_uprns, bucket_name=SPATIAL_BUCKET_NAME)
+spatial_data["UPRN"] = spatial_data["UPRN"].astype(str)
+
+asset_list = asset_list.merge(spatial_data, how="left", left_on="uprn", right_on="UPRN")
+
+asset_list.to_csv(OUTPUT_CSV_PATH, index=False)
diff --git a/etl/customers/remote_assessments/app.py b/etl/customers/remote_assessments/app.py
index 13cdc41b..fc3b7ec6 100644
--- a/etl/customers/remote_assessments/app.py
+++ b/etl/customers/remote_assessments/app.py
@@ -4,7 +4,7 @@ from dotenv import load_dotenv
from utils.s3 import save_csv_to_s3
from etl.find_my_epc.AssetListEpcData import AssetListEpcData
-PORTFOLIO_ID = 126
+PORTFOLIO_ID = 134
USER_ID = 8
load_dotenv(dotenv_path="backend/.env")
@@ -19,22 +19,24 @@ def app():
asset_list = [
{
- "address": "Garden Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "building_id": 1,
- "uprn": 308249,
+ "address": "Flat 2, 42 Malden Road, London NW5 3HG",
+ "postcode": "NW5 3HG",
+ "uprn": 5117165,
},
{
- "address": "Top Floor Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "building_id": 1,
- "uprn": 308251
+ "address": "15 Bournville Lane",
+ "postcode": "B30 2JY",
+ "uprn": 100070301128
},
{
- "address": "First Floor Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "building_id": 1,
- "uprn": 308250,
+ "address": "34 Bournville Lane",
+ "postcode": "B30 2LN",
+ "uprn": 100070301140
+ },
+ {
+ "address": "36 Bournville Lane",
+ "postcode": "B30 2LN",
+ "uprn": 100070301142
}
]
asset_list = pd.DataFrame(asset_list)
@@ -65,20 +67,21 @@ def app():
valuation_data = [
{
- "address": "Garden Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "valuation": 337_000
+ "uprn": 5117165,
+ "valuation": 467_000
},
{
- "addresss": "Top Floor Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "valuation": 337_000
+ "uprn": 100070301128,
+ "valuation": 335_000
},
{
- "address": "First Floor Flat, 48 Bedminster Parade",
- "postcode": "BS3 4HS",
- "valuation": 337_000
- }
+ "uprn": 100070301140,
+ "valuation": 276_000
+ },
+ {
+ "uprn": 100070301142,
+ "valuation": 276_000
+ },
]
# Store valuation data to s3
valuation_filename = f"{USER_ID}/{PORTFOLIO_ID}/valuation.csv"
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
index 8538188b..95fe4fcd 100644
--- a/etl/customers/stonewater/Wave 3 Preparation.py
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -1,4 +1,7 @@
import os
+from urllib import parse
+from fuzzywuzzy import fuzz
+
import PyPDF2
import re
import pandas as pd
@@ -128,6 +131,7 @@ def extract_summary_report(pdf_path):
"Current SAP Rating": None,
"Current EPC Band": None,
"Fuel Bill": None,
+ "Main Building Age Band": None,
"Number of Storeys": None,
"Window Age Description": None,
"Window Age Description Proportion (%)": None,
@@ -177,6 +181,13 @@ def extract_summary_report(pdf_path):
sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
data["Current SAP Rating"] = sap_match.group(1).split(" ")[1]
+ # Extract age
+ age_band_match = re.search(
+ r"3\.0 Date Built:\s*Main Property\s*[A-Z]?\s*(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
+ text
+ )
+ data["Main Building Age Band"] = age_band_match.group(1)
+
# Number of storeys
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
data["Number of Storeys"] = int(storeys_match.group(1))
@@ -465,7 +476,11 @@ def extract_building_parts_summary(text):
r"Dimensions:\s*Dimension type: Internal\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
)
if not dimensions_section:
- raise ValueError("Failed to locate dimensions section in the text.")
+ dimensions_section = re.search(
+ r"Dimensions:\s*Dimension type: External\n(.*?)\n5\.0 Conservatory:", text, re.DOTALL
+ )
+ if not dimensions_section:
+ raise ValueError("Failed to locate dimensions section in the text.")
dimensions_text = dimensions_section.group(1)
@@ -694,6 +709,7 @@ def extract_epr(pdf_path):
"Primary Energy Use (kWh/yr)": None,
"Primary Energy Use Intensity (kWh/m2/yr)": None,
"Number of Storeys": None,
+ "Main Building Age Band": None,
"Fuel Bill": None,
"Window Age Description": None,
"Window Age Description Proportion (%)": None,
@@ -747,12 +763,38 @@ def extract_epr(pdf_path):
# Extract Current and Potential SAP ratings
sap_match = re.search(r"GG \(1-20\)\s*(\d{1,2})\s*(\d{1,2})", text)
- current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
- data["Current SAP Rating"] = current_sap
+    if sap_match is None:
+        # Handles the older format of the elmhurst EPR
+        # The text will look something like this:
+        # Least energy efficient - higher running costsD 61 - we extract D 61
+        # Two band-letter/score pairs sit back to back; only the first pair is read below.
+        # NOTE(review): the current_* names are required by the .group() calls; the potential_* names
+        # are reconstructed (the group names were lost from this text) - confirm against the real file.
+        sap_match = re.search(
+            r"(?P<current_epc>[A-G])\s(?P<current_sap>\d{1,3})"
+            r"(?P<potential_epc>[A-G])\s(?P<potential_sap>\d{1,3})",
+            text)
+        data["Current EPC Band"] = sap_match.group("current_epc")
+        data["Current SAP Rating"] = int(sap_match.group("current_sap"))
+    else:
+        # Newer format: group 1 is the current rating; group 2 is parsed but not used
+        current_sap, _ = int(sap_match.group(1)), int(sap_match.group(2))
+        data["Current SAP Rating"] = current_sap
# Extract the primary energy use intensity
additional_rating_match = re.search(r"Additional ratings for your home\s*([\d.]+)", text)
- data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+    if additional_rating_match:
+        data["Primary Energy Use Intensity (kWh/m2/yr)"] = float(additional_rating_match.group(1))
+    else:
+        # Handles the older format of the Elmhurst EPR
+        # Group names restored so they match the .group("primary_energy") / .group("floor_area") lookups
+        primary_energy_match = re.search(r"actual consumption\.\n(?P<primary_energy>\d+)", text)
+        data["Primary Energy Use (kWh/yr)"] = int(primary_energy_match.group("primary_energy"))
+        # We calculate the primary energy use intensity by dividing by floor area
+        floor_area = re.search(r"Total Floor Area\s(?P<floor_area>\d+)\s?m2", text).group("floor_area")
+        data["Primary Energy Use Intensity (kWh/m2/yr)"] = data["Primary Energy Use (kWh/yr)"] / int(floor_area)
+
+ # Extract age band
+ age_band_match = re.search(
+ r"Building part:\s*Main\s*-\s*built in\s*(?:[A-Z]\s*)?(\d{4}-\d{4}|before \d{4}|\d{4} onwards)",
+ text
+ )
+
+ data["Main Building Age Band"] = age_band_match.group(1)
# Extract Number of Storeys
storeys_match = re.search(r"Number of Storeys:\s*(\d+)", text)
@@ -880,11 +922,18 @@ def detect_report_type(pdf_path, pdf_file):
"""
# Attempt to read the first page of the PDF to determine type
with open(pdf_path, "rb") as file:
+ # This code raises some warnings like Multiple definitions in dictionary at byte 0x1ab for key /Filter
+ # This is because the pdf is irregular. We could possibly try a library like fitz to handle this
reader = PyPDF2.PdfReader(file)
first_page_text = reader.pages[0].extract_text() if reader.pages else ""
+ n_pages = len(reader.pages)
- if is_energy_report(first_page_text):
+ if is_energy_report(first_page_text) and n_pages > 3:
+ # The EPR should have more than 3 pages
return "epr"
+ elif is_energy_report(first_page_text) and n_pages <= 3:
+ # This is a shortened version of the EPR which isn't massively useful
+ return "short_form_epr"
elif "summary" in pdf_file.lower() or is_summary_report(first_page_text):
return "summary"
elif is_condition_report(first_page_text):
@@ -1675,7 +1724,6 @@ def append_stonewater_id():
)
model_proposed_sample = model_proposed_sample[~pd.isnull(model_proposed_sample["Address ID"])]
model_proposed_sample["Address ID"] = model_proposed_sample["Address ID"].astype(int)
- z = model_proposed_sample["Archetype ID"].drop_duplicates().sort_values()
original_archetypes = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24 "
@@ -2906,6 +2954,14 @@ def identify_incorrect_packages():
)
+def extract_sharepoint_url(x):
+    """
+    Return the last two path segments of the link embedded in a coordination-sheet cell.
+
+    The cell is expected to look like "<label> - http(s)://host/.../<parent>/<folder>"; the result is
+    "<parent>/<folder>" with percent-escapes decoded.
+
+    :param x: Cell value holding the link text; may be null (None/NaN).
+    :return: "<parent>/<folder>" taken from the URL path, or "" when x is null or holds no
+        " - http" separator.
+    """
+    if pd.isnull(x):
+        return ""
+
+    pieces = x.split(" - http")
+    if len(pieces) < 2:
+        # No embedded link: return the same empty value used for null cells instead of raising IndexError
+        return ""
+
+    # Put back the "http" consumed by the separator so plain-http and https links parse identically
+    url_path = parse.urlparse("http" + pieces[1]).path
+    # Decode each segment after splitting so an escaped "/" (%2F) cannot move the segment boundaries
+    return "/".join(parse.unquote(segment) for segment in url_path.split("/")[-2:])
+
+
def revised_model():
"""
This function implements the revised model for Stonewater, where we are looking at new priority postcodes
@@ -2913,7 +2969,6 @@ def revised_model():
"""
# 1) Create the new list of properties
-
new_priority_postcodes = pd.read_excel(
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Jan 2025 Project/Updated 2025 to 2030 "
"priority list.xlsx"
@@ -2927,16 +2982,1312 @@ def revised_model():
original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+ original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)
- original_archetypes = original_archetypes[
- ["Address ID", "Archetype ID", ""]
- ]
+ wave_21_folder_name = "Wave 2.1 Surveys - 2"
# Check if we have all of the addresses
missed = original_archetypes[
~original_archetypes["Address ID"].isin(new_priority_postcodes["Address ID"].values)
]["Archetype ID"].unique()
- assert
+
+ assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}
+
+ original_archetypes = original_archetypes[
+ ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"]
+ ]
+
+ # Merge these archetypes on to the new priority postcodes
+ new_priority_postcodes = new_priority_postcodes.merge(
+ original_archetypes, how="left", on="Address ID"
+ )
+
+ # Basic check: no row with a missing Archetype ID should have an Address ID that exists in original_archetypes
+ assert float(new_priority_postcodes[pd.isnull(new_priority_postcodes["Archetype ID"])]["Address ID"].isin(
+ original_archetypes["Address ID"]
+ ).sum()) == 0
+
+ # We pull together the survey data sheet
+ survey_folders = []
+
+ # Loop over each survey folder and list its contents
+ for i in range(1, NUM_FOLDERS + 1):
+ folder_path = os.path.join(CUSTOMER_FOLDER_PATH, f"StonewaterSurveys_{i}")
+ if os.path.isdir(folder_path): # Check if folder exists
+ folder_contents = [os.path.join(f"StonewaterSurveys_{i}", file) for file in os.listdir(folder_path)]
+ survey_folders.extend(folder_contents) # Append contents to the master list
+
+ wave_21_folders = [
+ "1. Herefordshire",
+ "2. Bedfordshire",
+ "3. Wiltshire",
+ "4. Bournemouth",
+ "5. Coventry",
+ "6. West Sussex",
+ "7. Dorset",
+ "8. Cambridgeshire",
+ "9. Guildford",
+ "10. Little Island",
+ "11. CCS Dorset"
+ ]
+
+ for wave_2_1_folder in wave_21_folders:
+ folder_path = os.path.join(CUSTOMER_FOLDER_PATH, wave_21_folder_name, wave_2_1_folder)
+ if os.path.isdir(folder_path): # Check if folder exists
+ folder_contents = [os.path.join(wave_21_folder_name, wave_2_1_folder, file) for file in
+ os.listdir(folder_path)]
+ survey_folders.extend(folder_contents) # Append contents to the master list
+
+ # We now do a large pull of all of the data
+ extracted_data = []
+ mtp_extracted_data = [] # Additional data to extract from the medium term plans
+ for survey_folder in tqdm(survey_folders):
+ survey_folder_path = os.path.join(CUSTOMER_FOLDER_PATH, survey_folder)
+
+ # Check that the survey folder is actually a folder
+ if not os.path.isdir(survey_folder_path):
+ continue
+
+ # List the folders inside of the survey folder
+ survey_subfolders = [
+ name for name in os.listdir(survey_folder_path)
+ if os.path.isdir(os.path.join(survey_folder_path, name))
+ ]
+
+ # Check if there's a "retrofit assessment" folder
+ retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+
+ ra_folder = next(
+ (name for name in survey_subfolders if "ra coordinator info" in name.lower() or "ra info" in name.lower()),
+ None
+ )
+
+ mtp_folder = next(
+ (name for name in survey_subfolders if "mid-term" in name.lower() or "mtp" in name.lower()),
+ None
+ )
+ if mtp_folder:
+ # We have a mid term plan:
+ mtp_folder_path = os.path.join(survey_folder_path, mtp_folder)
+ # Get the contents - files and not folder
+ mtp_contents = [
+ os.path.join(mtp_folder, file) for file in os.listdir(mtp_folder_path)
+ if ".DS_Store" not in file and not os.path.isdir(os.path.join(mtp_folder_path, mtp_folder, file))
+ ]
+
+ has_v1 = [
+ f for f in mtp_contents if "v1" in f.lower() or "/ss" in f.lower()
+ ]
+
+ if has_v1:
+ # Then we go one level deeper
+ mtp_contents = [
+ os.path.join(has_v1[0], f) for f in
+ os.listdir(os.path.join(survey_folder_path, has_v1[0]))
+ ]
+
+ # We check each file to see whether it is the IMA
+ for file_name in mtp_contents:
+
+ filepath = os.path.join(survey_folder_path, file_name)
+ # We expect a pdf so try and parse it
+ try:
+ with open(filepath, "rb") as file:
+ reader = PyPDF2.PdfReader(file)
+ # Just the first page
+ text = reader.pages[0].extract_text()
+
+ except Exception as e:
+ continue
+
+ # We check if this is an IMA
+ ima_heading_search = re.search(
+ r"Improvement measure\s+Capital Cost\s+Lifetime of\s*\n\s*measureFuel saving\s*Lifetime fuel", text
+ )
+
+ is_ima = bool(ima_heading_search)
+ if not is_ima:
+ continue
+
+ # Otherwise, extract: RIR, PV
+ pv_search = re.search(r"PV \(\d+Kwp\)", text)
+ has_pv = bool(pv_search)
+ pv_system = pv_search.group(0) if has_pv else None
+
+ # We perform a second search for PV:
+ if pv_search is None:
+ pv_search = re.search("solar pv", text.lower())
+ has_pv = bool(pv_search)
+ pv_system = "Solar PV" if has_pv else None
+
+ rir_search = re.search(r"RIR \(\d+(\.\d+)?\)", text)
+ has_rir = bool(rir_search)
+ rir_spec = rir_search.group(0) if has_rir else None
+
+ mtp_extracted_data.append({
+ "survey_folder": survey_folder,
+ "has_pv": has_pv,
+ "PV System": pv_system,
+ "RIR Specification": rir_spec,
+ "has_rir": has_rir
+ })
+ continue
+
+ # If retrofit assessment folder exists, check if it has content
+ if retrofit_folder or ra_folder:
+ if retrofit_folder:
+ retrofit_folder_path = os.path.join(survey_folder_path, retrofit_folder)
+ else:
+ retrofit_folder_path = os.path.join(survey_folder_path, ra_folder)
+
+ # Check if everything inside is a sub-folder and the number of folders is 2
+ items = [item for item in os.listdir(retrofit_folder_path) if item != '.DS_Store']
+ all_folders = [os.path.isdir(os.path.join(retrofit_folder_path, item)) for item in items]
+ if all(all_folders) and len(all_folders) == 2 and "Property Pics" in items:
+ # Get the folder that isn't Property Pics
+ retrofit_folder_path = os.path.join(
+ retrofit_folder_path, [item for item in items if item != "Property Pics"][0]
+ )
+
+ if os.listdir(retrofit_folder_path): # If not empty
+ summary_data = extract_retrofit_pdfs(data_folder_path=retrofit_folder_path)
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+ continue
+ else:
+ # Then we have an empty Retrofit Assessment folder
+ continue
+
+ # If no retrofit folder or it was empty, check files in survey_folder
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+ if not summary_data:
+ if len(survey_subfolders) == 1:
+ survey_folder_path = os.path.join(survey_folder_path, survey_subfolders[0])
+ summary_data = extract_retrofit_pdfs(data_folder_path=survey_folder_path)
+
+ if summary_data:
+ summary_data = {
+ "survey_folder": survey_folder,
+ **summary_data,
+ }
+ extracted_data.append(summary_data)
+
+ retrofit_assessment_data = pd.DataFrame(extracted_data)
+ mtp_df = pd.DataFrame(mtp_extracted_data)
+
+ # Save
+ # retrofit_assessment_data.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"), index=False
+ # )
+ # mtp_df.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"), index=False
+ # )
+ retrofit_assessment_data = pd.read_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/Retrofit Assessment Data Sheet 5.csv"),
+ )
+ mtp_df = pd.read_csv(
+ os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project/MTP Data Sheet 5.csv"),
+ )
+
+ # There are a few duplicates we just manually drop
+ mtp_df = mtp_df.drop_duplicates()
+ mtp_df = mtp_df[
+ ~((
+ mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/1. Herefordshire/(043) Manor Fields 27"
+ ) & (~mtp_df["has_pv"]))
+ ]
+
+ mtp_df = mtp_df[
+ ~((
+ mtp_df["survey_folder"] == "Wave 2.1 Surveys - 2/2. Bedfordshire/(147) Gilpin Close 5"
+ ) & (~mtp_df["has_pv"]))
+ ]
+
+ # Remove some definite duplicates
+ dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].duplicated()]["Address"]
+ dupes = retrofit_assessment_data[retrofit_assessment_data["Address"].isin(dupes)]
+ dupes = dupes.sort_values("Address")
+ # Get all of the folders that end with ROSS
+ to_drop = dupes[dupes["survey_folder"].str.endswith("ROSS")]["survey_folder"].unique().tolist()
+
+ # Replace \n with ""
+ retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")
+
+ retrofit_assessment_data = retrofit_assessment_data[
+ ~retrofit_assessment_data["survey_folder"].isin(
+ [
+ "StonewaterSurveys_10/4 Beech Road, LUTON, LU1 1DP ROSS",
+ "StonewaterSurveys_2/135 Runley Road, LUTON, LU1 1TX ROSS",
+ "StonewaterSurveys_13/7 Saxon Road, LUTON, LU3 1JR ROSS"
+ ] + to_drop
+ )
+ ]
+
+ retrofit_assessments_data_columns = [
+ 'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)',
+ 'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys',
+ 'Fuel Bill', 'Window Age Description',
+ 'Window Age Description Proportion (%)',
+ 'Secondary Window Age Description',
+ 'Secondary Window Age Description Proportion (%)', 'Number of Windows',
+ 'Total Number of Doors', 'Number of Insulated Doors',
+ 'Existing Primary Heating System',
+ 'Existing Primary Heating PCDF Reference',
+ 'Existing Primary Heating Controls',
+ 'Existing Primary Heating % of Heat',
+ 'Existing Secondary Heating System',
+ 'Existing Secondary Heating PCDF Reference',
+ 'Existing Secondary Heating Controls',
+ 'Existing Secondary Heating % of Heat', 'Secondary Heating Code',
+ 'Water Heating Code', 'Total Floor Area (m2)',
+ 'Total Ground Floor Area (m2)', 'RIR Floor Area',
+ 'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)',
+ 'Number of Light Fittings', 'Number of LEL Fittings',
+ 'Number of fittings needing LEL', 'Main Roof Type',
+ 'Main Roof Insulation', 'Main Roof Insulation Thickness',
+ 'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining',
+ 'Main Wall Thickness', 'Main Building Alternative Wall Type',
+ 'Main Building Alternative Wall Insulation',
+ 'Main Building Alternative Wall Dry-lining',
+ 'Main Building Alternative Wall Thickness',
+ 'Main Fuel',
+ 'Main Building Age Band',
+ ]
+ # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
+ retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
+ rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed))
+ retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict)
+ retrofit_assessment_data["Survey: Current EPC Band"] = (
+ retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x))
+ )
+
+ # We can read in the data as needed
+
+ # Next Step: Read in the coordinated measures and match to the extracted data
+ ############################################################
+ # CCS
+ #############################################################
+ ccs_coordination_sheet = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Jan 2025 Project",
+ "CCS_Installation_Compliance_-_Stonewater_SHDF_2_1_1738228227.xlsx"
+ ),
+ header=4
+ )
+ ccs_postcodes = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "CCS_Installation_Compliance_CCS.xlsx"
+ ),
+ header=4
+ )
+ ccs_coordination_sheet = ccs_postcodes[['Name', 'Post Code', 'Asset ID', 'Asset ID.1']].merge(
+ ccs_coordination_sheet, how="left", on="Name"
+ )
+ ccs_coordination_sheet = ccs_coordination_sheet[~pd.isnull(ccs_coordination_sheet["Name"])]
+ ccs_coordination_sheet["contractor"] = "CCS"
+ # We split ccs into two sections - the last 21 rows (removed from programme) and the first 87 rows
+ ccs_coordination_removed_from_programme = ccs_coordination_sheet.tail(21)
+ ccs_coordination_sheet = ccs_coordination_sheet.head(87)
+ ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])
+
+ ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x))
+
+ ############################################################
+ # WATES
+ #############################################################
+ wates_coordination_sheet = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_1738229226.xlsx"
+ ),
+ header=4
+ )
+ wates_postcodes = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "Stonewater_SAP_Installation_Compliance_Vinci-Wates.xlsx"
+ ),
+ header=4
+ )
+ wates_postcodes = wates_postcodes[~pd.isnull(wates_postcodes["Post Code"])]
+ wates_coordination_sheet = wates_coordination_sheet.merge(
+ wates_postcodes[['Name', 'Post Code', 'Asset ID']].drop_duplicates(), how="left", on="Name"
+ )
+
+ wates_coordination_sheet["contractor"] = "Wates"
+ # Break into the different sites:
+ # Wiltshire
+ wates_coordination_sheet_wiltshere = wates_coordination_sheet.head(267)
+ wates_coordination_sheet_herefordshire = wates_coordination_sheet.iloc[271:332, :]
+ wates_coordination_sheet_coventry = wates_coordination_sheet.iloc[336:409, :]
+ wates_coordination_sheet_bedfordshire = wates_coordination_sheet.iloc[413:520, :]
+ wates_coordination_sheet_bournemouth = wates_coordination_sheet.iloc[524:567, :]
+ wates_coordination_sheet_cambridgeshire = wates_coordination_sheet.iloc[571:581, :]
+ wates_coordination_sheet_removed_from_programme = wates_coordination_sheet.iloc[586:926, :]
+ wates_coordination_sheet_abeyance = wates_coordination_sheet.iloc[930:972, :]
+
+ wates_coordination = pd.concat(
+ [
+ wates_coordination_sheet_wiltshere,
+ wates_coordination_sheet_herefordshire,
+ wates_coordination_sheet_coventry,
+ wates_coordination_sheet_bedfordshire,
+ wates_coordination_sheet_bournemouth,
+ wates_coordination_sheet_cambridgeshire,
+ wates_coordination_sheet_removed_from_programme,
+ wates_coordination_sheet_abeyance
+ ]
+ )
+ # We correct the Asset ID for 34 Kempster Close
+ wates_coordination["Asset ID"] = np.where(
+ wates_coordination["Name"] == "34 Kempster Close",
+ "12005",
+ wates_coordination["Asset ID"]
+ )
+
+ # We fill the missing ids
+ missing_lookup = {
+ "4 Sydnall Fields": 31231,
+ "12 Sydnall Fields": 31239,
+ "12 Athena Gardens": 28061,
+ "49 Banner Lane": 41189,
+ "4 Jonathan Road": 41232,
+ "8 Jonathan Road": 41236,
+ "1 Jonathan Road": 41229,
+ "96 Taunton Way": 31417,
+ "94 Taunton Way": 31418,
+ "1 Lady Lane": 29430,
+ "10 Jonathan Road": 41283,
+ "21 Jonathan Road": 41246,
+ "12 Ashcroft Close": 26399
+ }
+ for name, asset_id in missing_lookup.items():
+ wates_coordination["Asset ID"] = np.where(
+ wates_coordination["Name"] == name,
+ asset_id,
+ wates_coordination["Asset ID"]
+ )
+
+ wates_coordination = wates_coordination[~pd.isnull(wates_coordination["Asset ID"])]
+
+ wates_coordination["folder_path"] = wates_coordination["Sharepoint Folder"].apply(
+ lambda x: extract_sharepoint_url(x)
+ )
+
+ ############################################################
+ # NEW 450 COORDINATED RETROFIT ASSESSMENTS
+ #############################################################
+ features = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+ "master sheet.csv",
+ encoding='latin1'
+ )
+ features["Address ID"] = features["Address ID"].astype(str).astype(int)
+ features_to_merge = features[["Address ID", "Organisation Reference"]]
+
+ retrofit_packages_board = pd.read_excel(
+ os.path.join(
+ CUSTOMER_FOLDER_PATH,
+ "Stonewater_SHDF_3_0_Board_work_in_progress_-_Operations_1732034933 Final 19.11.24.xlsx"
+ ),
+ header=4
+ )
+ retrofit_packages_board = retrofit_packages_board[~pd.isnull(retrofit_packages_board["Name"])]
+ # Take just the rows that have been surveyed
+ retrofit_packages_board = retrofit_packages_board[
+ retrofit_packages_board["RA"].isin(["Invoiced", "Completed"])
+ ]
+
+ retrofit_packages_board = retrofit_packages_board.merge(
+ features_to_merge, how="left", on="Address ID"
+ )
+
+ manual_filters = {
+ "Flat 21 Walmer Street": "StonewaterSurveys_14/91-1-Flat 21 Walmer Street-HR4 9JD",
+ "6 Cornewall Close": "StonewaterSurveys_14/aa 6, Cornewall Close, Moccas, HEREFORD, HR2 9LG",
+ "2 Bromyard Road": "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ",
+ 'Flat 18, 1 Raglan Court': "StonewaterSurveys_13/60-3-18 Raglan Court, 1 Raglan Court-MK41 8QT",
+ '14 Raglan Court, 1 Devizes Avenue': 'StonewaterSurveys_12/55-3-14 Raglan Court, Devizes Avenue-MK41 8QT',
+ '19 South Road': 'StonewaterSurveys_4/19 The Oaks, South Road, SMETHWICK, B67 7BY',
+ 'Flat 12 Pelican Lane': 'StonewaterSurveys_1/121-3-Flat 12 Lynton Court, Pelican Lane-RG14 1NN',
+ 'Flat C, 44 St Leonards Avenue': 'StonewaterSurveys_11/427-2-44c St. Leonards Avenue-MK42 0RB',
+ '16 The Crescent, Kington': 'StonewaterSurveys_9/360-3-16 The Crescent-HR5 3AS',
+ '2 School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '14 South Road': 'StonewaterSurveys_2/14 The Oaks, South Road, SMETHWICK, B67 7BY',
+ '1 Groves Street': 'StonewaterSurveys_4/19-5-1 Groves Street-SN2 2BW',
+ '2 Calshot Walk': 'StonewaterSurveys_3/156-3-2 Calshot Walk-MK41 8QS',
+ '21 Constitution Hill': 'StonewaterSurveys_1/112-11-21 Constitution Hill-BH14 0PX',
+ '22 Constitution Hill': 'StonewaterSurveys_4/185-8-22 Constitution Hill-BH14 0PX',
+ '2 Marches Cottages, School Lane, Leominster': 'StonewaterSurveys_5/224-1-2 School Lane-HR6 8AA',
+ '26, Copthorn House, Brighton Road': 'StonewaterSurveys_15/133-1-26 Brighton Road-KT20 6BQ',
+ '4, Old St Marys, Ripley Lane': "StonewaterSurveys_15/433-3-4 Ripley Lane-KT24 6JG",
+ '1 Nelson House, Short Street': 'StonewaterSurveys_15/89-2-1 Short Street-GU11 1HX',
+ "18 Nelson House, Short Street": 'StonewaterSurveys_15/25-3- 18 Short Street- GU11 1HX',
+ '3 Nelson House, Short Street': 'StonewaterSurveys_2/138-1-3 Short Street-GU11 1HX',
+ '16, Copthorn House, Brighton Road': 'StonewaterSurveys_13/78-3-16 Brighton Road-KT20 6BQ',
+ '20 Nelson House, Short Street': 'StonewaterSurveys_15/89-1-20 Short Street-GU11 1HX',
+ '7 Croft Street': 'StonewaterSurveys_8/333-2-7 Croft Street-HR6 8LA'
+ }
+
+ # We now match this retrofit packages board to the extracted data
+ matching_lookup = []
+ for _, home in tqdm(retrofit_packages_board.iterrows(), total=len(retrofit_packages_board)):
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
+ # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+ if to_filter.sum() == 0:
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.replace(",", "").str.replace(".",
+ "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ # home["Name"] should be contained in the survey_folder
+ filtered = filtered[filtered["survey_folder"].str.contains(home["Name"], case=False)]
+ # We have an edge case where some properties have two outputs in Sharepoint
+ if home["Name"] == "197 Granby Court" and home["Postcode"] == "MK1 1NQ":
+ raise Exception("Fix me1")
+ # filtered = filtered[filtered["survey_folder"] == "113-1-197 Granby Court-MK1 1NQ"]
+
+ if home["Name"] == '1 Cluny Way' and home["Postcode"] == 'SG15 6ZB':
+ raise Exception("Fix me2")
+ # filtered = filtered[filtered["survey_folder"] == "12-1-1 Cluny Way-SG15 6ZB"]
+
+ if home["Name"] == '2 Bromyard Road' and home["Postcode"] == 'WR15 8BZ':
+ filtered = filtered[filtered["survey_folder"] == "StonewaterSurveys_4/192-9-2 Bromyard Road-WR15 8BZ"]
+
+ if filtered.empty:
+ continue
+ if filtered.shape[0] != 1:
+ raise Exception("something went wrong")
+
+ matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Address ID": home["Address ID"],
+ "Name": home["Name"]
+ }
+ )
+ matching_lookup = pd.DataFrame(matching_lookup)
+
+ ccs_coordination = ccs_coordination.rename(
+ columns={"Post Code": "Postcode"}
+ )
+ ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
+ ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
+
+ ccs_manual_filters = {
+ "35 Kittiwake Close": f"{wave_21_folder_name}/11. CCS Dorset/Kittiwake Close 35"
+ }
+ ccs_matching_lookup = []
+ for _, home in tqdm(ccs_coordination.iterrows(), total=len(ccs_coordination)):
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in ccs_manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == ccs_manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
+ # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["survey_folder"].
+ str.replace(r"[^\w\s]", "").
+ str.replace(",", "").
+ str.replace(".", "").
+ str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ # Do a fuzzy match on the name
+ # Find the best filter
+ to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply(
+ lambda x: fuzz.partial_ratio(home["Name"], x) > 93
+ )
+ if to_filter.sum() == 0:
+ # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+ # property name is actually 7 Colville Road, so we take the final space-separated part of the
+ # folder name and move it to the front
+ def reformat_survey_folder(x):
+ filename = x.split("/")[-1]
+ parts = filename.split(" ")
+ return " ".join(parts[-1:] + parts[:-1])
+
+ to_filter = (
+ filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+ home["Name"].lower()
+ )
+
+ if to_filter.sum() == 0:
+ raise Exception("Error")
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ ccs_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID.1": home["Asset ID.1"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ raise Exception("No match")
+
+ ccs_matching_lookup = pd.DataFrame(ccs_matching_lookup)
+ # We get a match for all records
+ assert ccs_matching_lookup.shape[0] == ccs_coordination.shape[0]
+ assert not pd.isnull(ccs_matching_lookup["Asset ID.1"]).sum()
+ assert not ccs_matching_lookup["Asset ID.1"].duplicated().sum()
+
+ # We do the same for Wates
+ wates_coordination = wates_coordination.rename(
+ columns={"Post Code": "Postcode"}
+ )
+ wates_coordination = wates_coordination[
+ wates_coordination["Retrofit Assessment"].isin(["Completed"])
+ ]
+ wates_coordination = wates_coordination[
+ ~pd.isnull(wates_coordination["Postcode"])
+ ]
+
+ wates_manual_filters = {
+ "24 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/24-25 Rabley Wood View",
+ "14 Edencroft": f"{wave_21_folder_name}/3. Wiltshire/14 Edencroft",
+ "Flat 31 Rabley Wood View": f"{wave_21_folder_name}/3. Wiltshire/Flat 31 Rabley Wood View",
+ 'Flat 13, Manor Fields': f'{wave_21_folder_name}/1. Herefordshire/(038) Manor Fields Flat 13',
+ "4 Kittys Lane": f"{wave_21_folder_name}/1. Herefordshire/(005) Kittys Lane 4",
+ '1 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 1',
+ '2 Jephson Court': f'{wave_21_folder_name}/5. Coventry/Jesphson Court 2',
+ }
+ wates_matching_lookup = []
+ # Examples to skip when we cannot get the data
+ wates_to_skip = [
+ "66 Abbatt Close", # File type is unusual, couldn't extract the data
+ "Flat 69 Goddard Road", # Doesn't exist
+ "19 Garth House", # # File type is unusual, couldn't extract the data
+ '5 Gilpin Close', # No properly formatted EPR
+ '49 The Hide, Netherfield', # TODO: TEMP HERE
+ '19 Chanders Rd',
+ '5 Chanders Rd',
+ '23 Chanders Rd',
+ '3 Chanders Rd',
+ '1 Orchard Close',
+ ]
+ wates_coordination = wates_coordination[~wates_coordination["Name"].isin(wates_to_skip)]
+
+ for _, home in tqdm(wates_coordination.iterrows(), total=len(wates_coordination)):
+
+ # Search the folder
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"].str.contains(home["folder_path"], regex=False)
+ ]
+ if len(filtered) == 1:
+ wates_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID": home["Asset ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ if home["Name"] in wates_to_skip:
+ continue
+
+ # Handle the case that has the wrong postcode in the asset data
+ if home["Name"] in wates_manual_filters:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["survey_folder"] == wates_manual_filters[home["Name"]]
+ ].copy()
+ else:
+ filtered = retrofit_assessment_data[
+ retrofit_assessment_data["Postcode"].str.lower() == home["Postcode"].lower()
+ ].copy()
+
+ # We check that home["Name"] is contained in the survey_folder, after removing punctuation and spaces
+ to_filter = filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace("Flat", "").lstrip(), case=False
+ )
+
+ if to_filter.sum() > 1:
+ to_filter = (
+ filtered["survey_folder"].str.replace(r"[^\w\s]", "").str.split("/").str[-1].str.lower() ==
+ home["Name"].replace(r"[^\w\s]", "").lstrip().lower()
+ )
+
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["survey_folder"].
+ str.replace(r"[^\w\s]", "").
+ str.replace(",", "").
+ str.replace(".", "").
+ str.contains(
+ home["Name"].replace(r"[^\w\s]", "").replace(",", ""), case=False
+ )
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ to_filter = (
+ filtered["Address"].str.replace(" ,", "").str.split(",").str[0:1].str.join("").str.lower() ==
+ home["Name"].lower()
+ )
+ if to_filter.sum() == 0:
+ # Do a fuzzy match on the name
+ # Find the best filter
+ to_filter = filtered["Address"].str.replace(" ,", "").str.split(",").str[0:2].str.join("").apply(
+ lambda x: fuzz.partial_ratio(home["Name"], x) > 93
+ )
+ if to_filter.sum() == 0:
+ # We also have some cases where the name of the survey folder is like "Colville Road 7" and the
+ # property name is actually 7 Colville Road, so we take the final space-separated part of the
+ # folder name and move it to the front
+ def reformat_survey_folder(x):
+ filename = x.split("/")[-1]
+ parts = filename.split(" ")
+ return " ".join(parts[-1:] + parts[:-1])
+
+ to_filter = (
+ filtered["survey_folder"].apply(lambda x: reformat_survey_folder(x)).str.lower() ==
+ home["Name"].lower()
+ )
+
+ if to_filter.sum() == 0:
+ raise Exception("Error")
+ filtered = filtered[to_filter]
+
+ if filtered.empty:
+ continue
+
+ if filtered.shape[0] == 1:
+ wates_matching_lookup.append(
+ {
+ "survey_folder": filtered["survey_folder"].values[0],
+ "Asset ID": home["Asset ID"],
+ "Name": home["Name"]
+ }
+ )
+ continue
+
+ raise Exception("No match")
+ wates_matching_lookup = pd.DataFrame(wates_matching_lookup)
+
+ # We get a match for all records
+ assert wates_matching_lookup.shape[0] == wates_coordination.shape[0]
+ assert not pd.isnull(wates_matching_lookup["Asset ID"]).sum()
+ assert not wates_matching_lookup["Asset ID"].duplicated().sum()
+
+ # Merge lookup tables onto the coordination sheets
+ wates_coordination = wates_coordination.merge(
+ wates_matching_lookup, how="left", on="Name"
+ )
+ missed_asset_id = wates_coordination[pd.isnull(wates_coordination["Asset ID_x"])]
+ if not missed_asset_id.empty:
+ raise Exception("Missing Asset ID")
+
+ if wates_coordination["Asset ID_x"].duplicated().sum():
+ raise Exception("Duplicated IDs in wates")
+
+ # We merge the mtp data onto the Wates coordination sheet
+ wates_coordination = wates_coordination.merge(
+ mtp_df, how="left", on="survey_folder"
+ )
+
+ ccs_coordination = ccs_coordination.merge(
+ ccs_matching_lookup, how="left", on="Name"
+ )
+ ccs_coordination = ccs_coordination.merge(
+ mtp_df, how="left", on="survey_folder"
+ )
+
+ retrofit_packages_board = retrofit_packages_board.merge(
+ matching_lookup, how="left", on="Name"
+ )
+
+ # We now map the retrofit assessment data to the coordinated packages
+ wates_coordination = wates_coordination.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+ ccs_coordination = ccs_coordination.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+ retrofit_packages_board = retrofit_packages_board.merge(
+ retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+ )
+
+ # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board
+ to_remove = wates_coordination[
+ wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+ ]
+ assert to_remove.shape[0] == 4
+ # Remove them from the wates board
+ wates_coordination = wates_coordination[
+ ~wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
+ ]
+
+ # We combine this into a singular board
+ coordinated_packages = pd.concat(
+ [
+ retrofit_packages_board[
+ [
+ "Name", "Postcode", 'Actual SAP Band', 'Actual SAP Rating',
+ 'Modelled SAP Band', 'Modelled SAP Rating', 'Package Ref',
+ 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+ 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+ 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+ 'Solar PV', 'Other measures', 'Organisation Reference',
+ ] + retrofit_assessments_data_columns_prefixed
+ ],
+ ccs_coordination[
+ [
+ # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls,
+ # Solar PV
+ "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+ 'SAP Band Install Package', 'Package Approved (Client)',
+ 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+ 'Ventilation', 'Heating', 'Other Measures', 'PV System',
+ "Asset ID.1_y",
+ ] + retrofit_assessments_data_columns_prefixed
+ ].rename(
+ columns={
+ "SAP Band Pre": "Actual SAP Band",
+ "SAP Rating Pre": "Actual SAP Rating",
+ 'SAP Rating Install Package': 'Modelled SAP Band',
+ 'SAP Band Install Package': 'Modelled SAP Rating',
+ 'Package Approved (Client)': 'Package Ref',
+ 'Wall Insulation': 'Main Wall Insulation',
+ 'Loft Insulation': 'Loft insulation',
+ 'Windows Upgrade': 'Window Upgrade',
+ 'Ext. Doors Upgrade': 'Door Upgrade',
+ 'Heating': 'Main Heating',
+ 'Other Measures': 'Other measures',
+ 'Asset ID.1_y': 'Organisation Reference',
+ "PV System": "Solar PV",
+ }
+ ),
+ wates_coordination[
+ [
+ "Name", "Postcode", 'SAP Band Pre', 'SAP Rating Pre', 'SAP Rating Install Package',
+ 'SAP Band Install Package', 'Package Approved (Client)',
+ 'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
+ 'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x', "PV System"
+ ] + retrofit_assessments_data_columns_prefixed
+ ].rename(
+ columns={
+ "SAP Band Pre": "Actual SAP Band",
+ "SAP Rating Pre": "Actual SAP Rating",
+ 'SAP Rating Install Package': 'Modelled SAP Band',
+ 'SAP Band Install Package': 'Modelled SAP Rating',
+ 'Package Approved (Client)': 'Package Ref',
+ 'Wall Insulation': 'Main Wall Insulation',
+ 'Loft Insulation': 'Loft insulation',
+ 'Windows Upgrade': 'Window Upgrade',
+ 'Ext. Doors Upgrade': 'Door Upgrade',
+ 'Heating': 'Main Heating',
+ 'Other Measures': 'Other measures',
+ 'Asset ID_x': 'Organisation Reference',
+ "PV System": "Solar PV",
+ }
+ )
+ ]
+ )
+
+ coordinated_packages["Organisation Reference"] = coordinated_packages["Organisation Reference"].astype(int)
+ assert not coordinated_packages["Organisation Reference"].duplicated().sum()
+
+ # Merge the property features on
+ coordinated_packages = coordinated_packages.merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+ how="left",
+ on="Organisation Reference"
+ )
+
+ coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])]
+ coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])]
+
+ # We need the features pertaining to these priority postcodes
+
+ def find_nearest_matching_property(coordinated_packages, home):
+ filter_levels = [
+ (["Postcode", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 2),
+ (["Postal Region", "Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 3),
+ (["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 4),
+ (["Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 5),
+ (["Primary Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"], 6),
+ (["Primary Property Type", "Walls", "Roof Simple", "Heating", "Main Fuel", "Age"], 7),
+ ]
+
+ max_confidence = max([confidence for (_, confidence) in filter_levels])
+
+ for i, (filters, match_confidence) in enumerate(filter_levels):
+ match = coordinated_packages.copy()
+
+ for col in filters:
+ match = match[match[col] == home[col]]
+
+ if not match.empty:
+ return match, match_confidence
+
+ # Finally, we search for a property in the same Archetype
+ match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
+ if not match.empty:
+ return match, max_confidence + 1
+
+ return None, None # No match found
+
+ coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
+ new_priority_postcodes["Postal Region"] = new_priority_postcodes["Postcode"].str.split(" ").str[0].str.strip()
+
+ coordinated_packages["Roof Simple"] = coordinated_packages["Roofs"].str.split(":").str[0].str.strip()
+ new_priority_postcodes["Roof Simple"] = new_priority_postcodes["Roofs"].str.split(":").str[0].str.strip()
+
+ coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
+ new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]
+
+ coordinated_packages = coordinated_packages.merge(
+ new_priority_postcodes[["Organisation Reference", "Archetype ID"]],
+ how="left",
+ on="Organisation Reference"
+ )
+
+ # For every property in the priority postcodes data, we look for a most appropriate matching property
+ no_match = []
+ matches = []
+ for _, home in tqdm(new_priority_postcodes.iterrows(), total=len(new_priority_postcodes)):
+ # We check if the property was surveyed
+ survey_result = coordinated_packages[
+ coordinated_packages["Organisation Reference"] == home["Organisation Reference"]
+ ]
+ if not survey_result.empty:
+ to_extend = [
+ {
+ "Organisation Reference": home["Organisation Reference"],
+ "Best Match Organisation Reference": m,
+ "match_confidence": 1,
+ "Was Surveyed": True
+ } for m in survey_result["Organisation Reference"].values
+ ]
+ matches.extend(to_extend)
+ continue
+
+ closest_match, match_confidence = find_nearest_matching_property(coordinated_packages, home)
+ if closest_match is None:
+ no_match.append(home["Organisation Reference"])
+ continue
+
+ to_extend = [
+ {
+ "Organisation Reference": home["Organisation Reference"],
+ "Best Match Organisation Reference": m,
+ "match_confidence": match_confidence,
+ "Was Surveyed": False
+ } for m in closest_match["Organisation Reference"].values
+ ]
+ matches.extend(to_extend)
+
+ no_match_summary = new_priority_postcodes[
+ new_priority_postcodes["Organisation Reference"].isin(
+ no_match
+ )
+ ].groupby(["Property Type", "Walls", "Roofs", "Heating", "Main Fuel", "Age"])[
+ "Organisation Reference"].count().reset_index()
+
+ no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)
+
+ # len(no_match)
+ # 8764, 5607, 5646, 5071
+ # no_match_summary.shape
+ # (3953, 6), (2948, 6), (2969, 7), (2575, 7)
+
+ matches_df = pd.DataFrame(matches)
+
+ matches_df = matches_df.merge(
+ coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]],
+ left_on="Best Match Organisation Reference", right_on="Organisation Reference",
+ suffixes=("", " - Closest Match")
+ )
+
+ measures_columns = [
+ 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+ 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+ 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+ 'Solar PV', 'Other measures'
+ ]
+
+ # We want to aggregate the matches, when we have multiple
+ aggregated_matches_df = []
+ for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
+
+ measures = coordinated_packages[
+ (
+ coordinated_packages["Organisation Reference"].isin(
+ mapped_matches['Best Match Organisation Reference'].values
+ )
+ )
+ ][measures_columns]
+
+ if mapped_matches.shape[0] == 1:
+ # Get the measures for this property
+ measures = measures.squeeze()
+
+ aggregated_matches_df.append(
+ {
+ "Organisation Reference": org_ref,
+ "Number of matches": 1,
+ "Proportion": 100,
+ "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
+ "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0],
+ "Was Surveyed": mapped_matches["Was Surveyed"].values[0],
+ **measures
+ }
+ )
+ continue
+
+ # We need to aggregate the matches, since we have multiple
+ average_rating = mapped_matches["Survey: Current SAP Rating"].mean()
+ number_of_matches = mapped_matches.shape[0]
+ average_epc_rating = sap_to_epc(average_rating)
+ # proportion is the percentage of matched properties whose surveyed EPC band equals the averaged band
+ proportion_with_this_epc = int(
+ mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
+ 0] / number_of_matches * 100
+ )
+
+ measures_aggregated = {}
+ for m in measures_columns:
+ if any(~pd.isnull(measures[m])):
+ # Check if we have more than one unique value
+ vals = measures[~pd.isnull(measures[m])][m].unique()
+ if len(vals) > 1:
+ measures_aggregated[m] = ", ".join(vals)
+ else:
+ measures_aggregated[m] = vals[0]
+
+ aggregated_matches_df.append(
+ {
+ "Organisation Reference": org_ref,
+ "Number of matches": number_of_matches,
+ "Proportion": proportion_with_this_epc,
+ "Estimated SAP Rating": average_rating,
+ "Estimated EPC Rating": average_epc_rating,
+ "Was Surveyed": False,
+ **measures_aggregated
+ }
+ )
+
+ aggregated_matches_df = pd.DataFrame(aggregated_matches_df)
+
+ mapped_priority_list = new_priority_postcodes.merge(
+ aggregated_matches_df, on="Organisation Reference", how="left"
+ )
+
+ mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0]
+
+ # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0
+
+ def remove_leading_zero(address):
+ return re.sub(r"^0([1-9]) ", r"\1 ", address)
+
+ mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
+ mapped_priority_list["address1"] = np.where(
+ mapped_priority_list["Organisation Reference"] == 37004,
+ "8 Mason Road",
+ mapped_priority_list["address1"]
+ )
+ mapped_priority_list["address1"] = np.where(
+ mapped_priority_list["Organisation Reference"] == 37003,
+ "9 Mason Road",
+ mapped_priority_list["address1"]
+ )
+
+ mapped_priority_list = mapped_priority_list.rename(
+ columns={"UPRN": "uprn"}
+ )
+ mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
+
+ # Flag where 2 out of the three columns have consensus
+ mapped_priority_list["2 of 3 Data Sources Have Consensus on EPC"] = (
+ (mapped_priority_list["SAP Band"] == mapped_priority_list["EPC Band"]) |
+ (mapped_priority_list["SAP Band"] == mapped_priority_list["Estimated EPC Rating"]) |
+ (mapped_priority_list["EPC Band"] == mapped_priority_list["Estimated EPC Rating"])
+ )
+
+ # Let's get the newest EPC data for these properties
+ # We merge on UPRN, when we have it
+ # from etl.route_march_data_pull.app import get_data
+ # epc_data, errors, nodata = get_data(
+ # asset_list=mapped_priority_list,
+ # fulladdress_column="Address",
+ # address1_column="address1",
+ # postcode_column="Postcode",
+ # manual_uprn_map={},
+ # epc_api_only=True
+ # )
+ #
+ # epc_df = pd.DataFrame(epc_data)
+ # epc_df.to_csv(
+ # os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False
+ # )
+ epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"))
+ epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"})
+
+ # We now package up the data
+
+ # Sheet 1 is the base coordination data
+ output_coordination_sheet = coordinated_packages[
+ [
+ "Name", "Postcode", 'Organisation Reference', 'Package Ref',
+ 'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+ 'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+ 'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+ 'Solar PV', 'Other measures',
+ 'Survey: Current SAP Rating',
+ 'Survey: Current EPC Band',
+ 'Survey: Primary Energy Use (kWh/yr)',
+ 'Survey: Primary Energy Use Intensity (kWh/m2/yr)',
+ 'Survey: Number of Storeys', 'Survey: Fuel Bill',
+ 'Survey: Window Age Description',
+ 'Survey: Window Age Description Proportion (%)',
+ 'Survey: Secondary Window Age Description',
+ 'Survey: Secondary Window Age Description Proportion (%)',
+ 'Survey: Number of Windows', 'Survey: Total Number of Doors',
+ 'Survey: Number of Insulated Doors',
+ 'Survey: Existing Primary Heating System',
+ 'Survey: Existing Primary Heating PCDF Reference',
+ 'Survey: Existing Primary Heating Controls',
+ 'Survey: Existing Primary Heating % of Heat',
+ 'Survey: Existing Secondary Heating System',
+ 'Survey: Existing Secondary Heating PCDF Reference',
+ 'Survey: Existing Secondary Heating Controls',
+ 'Survey: Existing Secondary Heating % of Heat',
+ 'Survey: Secondary Heating Code', 'Survey: Water Heating Code',
+ 'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)',
+ 'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)',
+ 'Survey: First Extension Wall Area (m2)',
+ 'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings',
+ 'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type',
+ 'Survey: Main Roof Insulation',
+ 'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type',
+ 'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining',
+ 'Survey: Main Wall Thickness',
+ 'Survey: Main Building Alternative Wall Type',
+ 'Survey: Main Building Alternative Wall Insulation',
+ 'Survey: Main Building Alternative Wall Dry-lining',
+ 'Survey: Main Building Alternative Wall Thickness',
+ 'Survey: Main Fuel',
+ 'Survey: Main Building Age Band',
+ 'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
+ ]
+ ].rename(
+ columns={
+ 'Walls': "Parity - Walls",
+ 'Roofs': "Parity - Roof",
+ 'Heating': "Parity - Heating",
+ 'Main Fuel': "Parity - Fuel",
+ 'Age': "Parity - Age Band",
+ 'Property Type': "Parity - Property Type"
+ }
+ )
+
+ # Sheet 2 is the lookup table which maps the properties to their closest match
+ # We need to bring in the parity attributes between the mapped properties so we can see side-by-side
+ mapped_lookup = matches_df[
+ [
+ 'Organisation Reference',
+ 'Best Match Organisation Reference',
+ 'Survey: Current EPC Band',
+ 'Survey: Current SAP Rating',
+ "Was Surveyed",
+ "match_confidence",
+ ]
+ ].rename(
+ columns={
+ 'Best Match Organisation Reference': "Best Match - Organisation Reference",
+ "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band",
+ 'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating"
+ }
+ ).merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+ "Total Floor Area"]],
+ how="left",
+ on="Organisation Reference"
+ ).merge(
+ features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type",
+ "Total Floor Area"]].rename(
+ columns={
+ "Organisation Reference": "Best Match - Organisation Reference",
+ "Walls": "Best Match - Walls",
+ "Roofs": "Best Match - Roof",
+ "Heating": "Best Match - Heating",
+ "Main Fuel": "Best Match - Main Fuel",
+ "Age": "Best Match - Age",
+ "Property Type": "Best Match - Property Type",
+ "Total Floor Area": "Best Match - Total Floor Area"
+ }
+ ),
+ how="left",
+ on="Best Match - Organisation Reference"
+ ).merge(
+ coordinated_packages[
+ [
+ "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
+ 'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
+ 'Survey: Existing Primary Heating System', 'Survey: Main Building Age Band',
+ 'Survey: Main Building Wall Area (m2)', 'Survey: Total Floor Area (m2)',
+ 'Survey: Main Building Age Band',
+ ]
+ ].rename(
+ columns={
+ "Organisation Reference": "Best Match - Organisation Reference",
+ 'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type',
+ 'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation',
+ 'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type',
+ 'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation',
+ 'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness',
+ 'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System',
+ }
+ ),
+ how="left",
+ on="Best Match - Organisation Reference"
+ )
+
+ # Finally, we have the property, against the mapped home with the estimated SAP scores and the EPC data
+ worksheet = mapped_priority_list[
+ [
+ 'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
+ 'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
+ 'Heating', 'Main Fuel', 'Hot Water', 'Number of matches', 'Proportion',
+ 'Estimated SAP Rating', 'Estimated EPC Rating', "Was Surveyed",
+ 'Main Wall Insulation',
+ 'Secondary Wall Insulation', 'Loft insulation', 'Flat Roof',
+ 'Room in Roof', 'Window Upgrade', 'Door Upgrade', 'Ventilation',
+ 'Main Heating', 'Water Heating', 'Heating Controls', 'Solar PV',
+ 'Other measures', "2 of 3 Data Sources Have Consensus on EPC"
+ ]
+ ].rename(
+ columns={
+ "SAP": "Parity - SAP Rating",
+ "SAP Band": "Parity - EPC Rating",
+ "Property Type": "Parity - Property Type",
+ "Walls": "Parity - Walls",
+ "Roofs": "Parity - Roofs",
+ 'Glazing': "Parity - Glazing",
+ 'Heating': 'Parity - Heating',
+ 'Main Fuel': 'Parity - Main Fuel',
+ 'Hot Water': 'Parity - Hot Water',
+ 'Proportion': 'Proportion of matched properties with same EPC rating',
+ }
+ ).merge(
+ epc_df[
+ [
+ "Organisation Reference",
+ "uprn",
+ "current-energy-efficiency",
+ "current-energy-rating",
+ "lodgement-date",
+ "construction-age-band",
+ "walls-description",
+ "roof-description",
+ "mainheat-description",
+ "windows-description",
+ "hotwater-description",
+ "main-fuel",
+ "total-floor-area",
+ ]
+ ].rename(
+ columns={
+ "uprn": "Last EPC - uprn",
+ "current-energy-efficiency": "Last EPC - SAP Score",
+ "current-energy-rating": "Last EPC - EPC Rating",
+ "lodgement-date": "Last EPC - Date Lodged",
+ "construction-age-band": "Last EPC - Age Band",
+ "walls-description": "Last EPC - Walls",
+ "roof-description": "Last EPC - Roof",
+ "mainheat-description": "Last EPC - Heating",
+ "windows-description": "Last EPC - Windows",
+ "hotwater-description": "Last EPC - Hot Water",
+ "main-fuel": "Last EPC - Main Fuel",
+ "total-floor-area": "Last EPC - Total Floor Area"
+ }
+ ),
+ how="left",
+ on='Organisation Reference'
+ )
+
+ worksheet["Years Since Last EPC"] = pd.Timestamp.now().year - pd.to_datetime(
+ worksheet["Last EPC - Date Lodged"]).dt.year
+
+ worksheet["Last EPC - uprn"] = worksheet["Last EPC - uprn"].astype("Int64").astype(str)
+
+ worksheet["uprn"] = np.where(
+ pd.isnull(worksheet["uprn"]) & pd.notnull(worksheet["Last EPC - uprn"]),
+ worksheet["Last EPC - uprn"],
+ worksheet["uprn"]
+ )
+
+ worksheet["uprn"] = worksheet["uprn"].replace("", "")
+
+ worksheet = worksheet.drop(columns=["Last EPC - uprn"])
+
+ # Save to Excel with multiple sheets
+ excel_path = os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "13022025 Stonewater Priority List.xlsx")
+ with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
+ worksheet.to_excel(writer, sheet_name="Worksheet", index=False, header=True)
+ mapped_lookup.to_excel(writer, sheet_name="Lookup Table", index=False, header=True)
+ output_coordination_sheet.to_excel(writer, sheet_name="Coordination", index=False, header=True)
# if __name__ == "__main__":
# main()
diff --git a/etl/customers/stonewater/data_cleaning.py b/etl/customers/stonewater/data_cleaning.py
index 8751960c..a5da0c79 100644
--- a/etl/customers/stonewater/data_cleaning.py
+++ b/etl/customers/stonewater/data_cleaning.py
@@ -1,6 +1,7 @@
import os
import shutil
from tqdm import tqdm
+from etl.access_reporting.app import SharePointClient
def delete_large_files():
@@ -66,13 +67,17 @@ def delete_large_files():
def download_data_from_sharepoint():
# Given a sharepoint location, this function will download the retrofit assessment folders from the locations
# specified in the sharepoint location
- from etl.access_reporting.app import SharePointClient
+
+ SHAREPOINT_CLIENT_ID = os.getenv("SHAREPOINT_CLIENT_ID", None)
+ SHAREPOINT_CLIENT_SECRET = os.getenv("SHAREPOINT_CLIENT_SECRET", None)
+ SHAREPOINT_TENANT_ID = os.getenv("SHAREPOINT_TENANT_ID", None)
+ OSMOSIS_SHAREPOINT_SITE_ID = os.getenv("OSMOSIS_SHAREPOINT_SITE_ID", None)
sharepoint_client = SharePointClient(
- tenant_id="10d5af8b-2cfd-4882-9ccd-b96e4812dacf",
- client_id="6832a4c5-fb8c-4082-a746-4f51e1020f0d",
- client_secret="xpC8Q~Frww48SM1V-D8lGy5iOY7P_cJ7FF3jgarQ",
- site_id="bc925a9a-ad0b-4de9-9a3c-e61014cc7489"
+ tenant_id=SHAREPOINT_TENANT_ID,
+ client_id=SHAREPOINT_CLIENT_ID,
+ client_secret=SHAREPOINT_CLIENT_SECRET,
+ site_id=OSMOSIS_SHAREPOINT_SITE_ID
)
# Retrieve the data from Sharepoint and write to local machine
@@ -81,9 +86,14 @@ def download_data_from_sharepoint():
folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders"
)
- len(contents["value"])
+ folders_to_keep = [
+ "1. Herefordshire", "2. Bedfordshire", "3. Wiltshire", "4. Bournemouth",
+ "5. Coventry", "6. West Sussex", "7. Dorset", "8. Cambridgeshire",
+ "9. Guildford", "10. Little Island", "11. CCS Dorset",
+ ]
+
folders_to_pull = [
- folder for folder in contents["value"] if folder["name"] in ["3. Wiltshire", "4. Bournemouth", "5. Coventry"]
+ folder for folder in contents["value"] if folder["name"] in folders_to_keep
]
for folder_to_pull in folders_to_pull:
# Get the contents
@@ -103,35 +113,42 @@ def download_data_from_sharepoint():
folder_path="Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders" + "/" +
folder_to_pull["name"] + "/" + property_folder["name"]
)
- # We look for the retrofit assessment folder:
+ if not property_folder_contents.get("value"):
+ continue
+ # We look for the retrofit assessment folder or mtp folders:
property_sub_folders = [
- f for f in property_folder_contents["value"] if "ra coordinator info" in f["name"].lower()
+ f for f in property_folder_contents["value"] if
+ "ra coordinator info" in f["name"].lower() or
+ "retrofit assessment" in f["name"].lower() or
+ "ra info" in f["name"].lower() or
+ "mtp" in f["name"].lower() or
+ "mid-term" in f["name"].lower()
]
if not property_sub_folders:
continue
- # if we have this, we download the folder and store it on my laptop!
- property_sub_folder = property_sub_folders[0]
+ for property_sub_folder in property_sub_folders:
+ # if we have this, we download the folder and store it on my laptop!
- property_folder_path = os.path.join(
- "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders",
- folder_to_pull["name"],
- property_folder["name"],
- property_sub_folder["name"]
- )
+ property_folder_path = os.path.join(
+ "Osmosis ACD/Osmosis ACD Projects/Stonewater/Stonewater Property ID Folders",
+ folder_to_pull["name"],
+ property_folder["name"],
+ property_sub_folder["name"]
+ )
- download_dir = os.path.join(
- "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys",
- folder_to_pull["name"],
- property_folder["name"],
- property_sub_folder["name"]
- )
+ download_dir = os.path.join(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 2.1 Surveys - 2",
+ folder_to_pull["name"],
+ property_folder["name"],
+ property_sub_folder["name"]
+ )
- # We download the folder
- sharepoint_client.download_sharepoint_folder(
- drive_id=sharepoint_client.document_drive["id"],
- folder_path=property_folder_path,
- download_dir=download_dir,
- excluded_file_types=["MOV"]
- )
+ # We download the folder
+ sharepoint_client.download_sharepoint_folder(
+ drive_id=sharepoint_client.document_drive["id"],
+ folder_path=property_folder_path,
+ download_dir=download_dir,
+ excluded_file_types=["MOV", "jpg"]
+ )
diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py
index bda9c30c..6666ce15 100644
--- a/etl/customers/stonewater/potential_eco_properties.py
+++ b/etl/customers/stonewater/potential_eco_properties.py
@@ -217,78 +217,7 @@ def app():
)
)
- # We get the EPC data
- # epc_data = json.loads(
- # read_from_s3(
- # bucket_name="retrofit-data-dev",
- # s3_file_name="customers/Stonewater/clustering/epc_data.json"
- # )
- # )
- # epc_data = pd.DataFrame(epc_data)
- #
- # epc_data["uprn"] = np.where(
- # epc_data["internal_id"] == 1091,
- # 83143766,
- # epc_data["uprn"]
- # )
- #
- # epc_data_batch_2 = read_pickle_from_s3(
- # s3_file_name="customers/Stonewater/clustering/epc_data_batch_2.pkl",
- # bucket_name="retrofit-data-dev"
- # )
- # epc_data_batch_2 = pd.DataFrame(epc_data_batch_2)
- #
- # complete_epcs = pd.concat([epc_data, epc_data_batch_2])
- #
- # epcs_to_merge = complete_epcs[
- # [
- # "uprn",
- # "address",
- # "postcode",
- # "property-type",
- # "built-form",
- # "inspection-date",
- # "current-energy-rating",
- # "current-energy-efficiency",
- # "roof-description",
- # "walls-description",
- # "transaction-type",
- # "secondheat-description",
- # "total-floor-area",
- # "construction-age-band",
- # "floor-height",
- # "number-habitable-rooms",
- # "mainheat-description",
- # "energy-consumption-current"
- # ]
- # ].rename(
- # columns={
- # "address": "Address",
- # "postcode": "Postcode",
- # "inspection-date": "Date of last EPC",
- # "current-energy-efficiency": "SAP score on register",
- # "current-energy-rating": "EPC rating on register",
- # "property-type": "Property Type",
- # "built-form": "Archetype",
- # "total-floor-area": "Property Floor Area",
- # "construction-age-band": "Property Age Band",
- # "floor-height": "Property Floor Height",
- # "number-habitable-rooms": "Number of Habitable Rooms",
- # "walls-description": "Wall Construction",
- # "roof-description": "Roof Construction",
- # "mainheat-description": "Heating Type",
- # "secondheat-description": "Secondary Heating",
- # "transaction-type": "Reason for last EPC",
- # "energy-consumption-current": "Heat Demand (kWh/m2)",
- # }
- # )
- # # We de-dupe, taking the newest on the date the EPC was lod
- # epcs_to_merge["Date of last EPC"] = pd.to_datetime(epcs_to_merge["Date of last EPC"])
- # epcs_to_merge = epcs_to_merge.sort_values("Date of last EPC", ascending=False)
- # epcs_to_merge = epcs_to_merge.drop_duplicates(subset="uprn")
-
stonewater_cavity_properties["UPRN"] = stonewater_cavity_properties["UPRN"].astype("Int64").astype(str)
- stonewater_cavity_properties["Reason Included"].value_counts()
# Find the postcodes where an Osmosis survey revealed a need for CWI
postcodes_found_needing_cwi = stonewater_cavity_properties[
stonewater_cavity_properties["Reason Included"].isin(
@@ -339,12 +268,7 @@ def app():
"Renewables": "Parity - Renewables",
"Total Floor Area": "Parity - Total Floor Area"
}
- ) # .merge(
- # epcs_to_merge,
- # how="left",
- # left_on="UPRN",
- # right_on="uprn"
- # )
+ )
# We now flag the additional properties in the as built list
@@ -434,20 +358,20 @@ def app():
additional_properties["Suspected Needs CWI - not surveyed"] = (
(
- additional_properties["Postcode"].isin(postcodes_found_needing_cwi)
+ additional_properties["Postcode"].isin(postcodes_found_needing_cwi) &
+ ~additional_properties["Installed under ECO3"]
)
)
- additional_properties["Same Postcode as Installed under ECO3"].value_counts()
-
# We drop Full Address
additional_properties = additional_properties.drop(columns=["Full Address"])
additional_properties2 = additional_properties[[
"Address", "Postcode", "Address ID", "SAP", "SAP Band", "Property Type", "Walls", "Roofs", "Glazing",
"Heating", "Main Fuel", "Hot Water", "Renewables", "Total Floor Area", 'Installed under ECO3',
- 'Same Postcode as Installed under ECO3'
+ 'Same Postcode as Installed under ECO3', "Organisation Reference",
]].rename(
columns={
+ "Organisation Reference": "Org. ref.",
"SAP": "Parity - Predicted SAP",
"SAP Band": "Parity - Predicted SAP Band",
"Age": "Parity - Build Age",
@@ -461,65 +385,62 @@ def app():
"Renewables": "Parity - Renewables",
"Total Floor Area": "Parity - Total Floor Area"
}
- ) # .merge(
- # pd.DataFrame(additional_properties_epcs)[
- # [
- # "row_id",
- # "property-type",
- # "built-form",
- # "inspection-date",
- # "current-energy-rating",
- # "current-energy-efficiency",
- # "roof-description",
- # "walls-description",
- # "transaction-type",
- # "secondheat-description",
- # "total-floor-area",
- # "construction-age-band",
- # "floor-height",
- # "number-habitable-rooms",
- # "mainheat-description",
- # "energy-consumption-current"
- # ]
- # ].rename(
- # columns={
- # "inspection-date": "Date of last EPC",
- # "current-energy-efficiency": "SAP score on register",
- # "current-energy-rating": "EPC rating on register",
- # "property-type": "Property Type",
- # "built-form": "Archetype",
- # "total-floor-area": "Property Floor Area",
- # "construction-age-band": "Property Age Band",
- # "floor-height": "Property Floor Height",
- # "number-habitable-rooms": "Number of Habitable Rooms",
- # "walls-description": "Wall Construction",
- # "roof-description": "Roof Construction",
- # "mainheat-description": "Heating Type",
- # "secondheat-description": "Secondary Heating",
- # "transaction-type": "Reason for last EPC",
- # "energy-consumption-current": "Heat Demand (kWh/m2)",
- # }
- # ),
- # how="left",
- # on="row_id"
- # )
+ )
+
+ # Combine the data:
+
+ stonewater_cavity_properties2 = stonewater_cavity_properties.merge(
+ features[["Address", "Organisation Reference"]], how="left", on="Organisation Reference"
+ )
+ full_dataset = pd.concat([stonewater_cavity_properties2, additional_properties2])
+ full_dataset = full_dataset.drop(columns=['Osm. ID'])
+
+ # We now define the priority list for non-intrusives
+ full_dataset["Postal Region"] = full_dataset["Postcode"].str.split(" ").str[0].str[0:2]
+ full_dataset["Postal Region 2"] = full_dataset["Postcode"].str.split(" ").str[0]
+
+ # Strip out anything we definitely don't want
+ full_dataset = full_dataset[~full_dataset["Installed under ECO3"]]
+
+ areas = full_dataset[full_dataset["Suspected Needs CWI - not surveyed"] == True]["Postal Region 2"].unique()
+
+ priorities = full_dataset[
+ full_dataset["Postal Region 2"].isin(areas)
+ ]
+
+ region_prevalance = priorities["Postal Region 2"].value_counts().to_frame().reset_index()
+ region_prevalance = region_prevalance[region_prevalance["count"] > 100]
+ df = priorities[priorities["Postal Region 2"].isin(region_prevalance["Postal Region 2"].values)]
+
+ df["Postal Region"].value_counts()
+ df["Postal Region 2"].value_counts()
+
+ if df["Installed under ECO3"].sum():
+ raise ValueError("There are properties in the priority list that were installed under ECO3")
+
+ df.to_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives - "
+ "revised list.csv",
+ index=False
+ )
# We save the data locally
- stonewater_cavity_properties.to_csv(
- "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
- "postcodes.csv",
- index=False
- )
- additional_properties2.to_csv(
- "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
- "non-priority postcodes.csv",
- index=False
- )
- # Save the survey findings
- needs_cwi.to_csv(
- "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv",
- index=False
- )
+ # stonewater_cavity_properties.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Cavity Properties - priority "
+ # "postcodes.csv",
+ # index=False
+ # )
+ # additional_properties2.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Additional Cavity Properties - "
+ # "non-priority postcodes.csv",
+ # index=False
+ # )
+ # # Save the survey findings
+ # needs_cwi.to_csv(
+ # "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI -
+ # WIP.csv",
+ # index=False
+ # )
def cross_reference_epc_programme():
@@ -528,6 +449,12 @@ def cross_reference_epc_programme():
"SURVEYED - ECO3 NOT COMPLETED.xlsx"
)
+ for _, x in eco3_fallout.iterrows():
+ house_no = SearchEpc.get_house_number(x["ADDRESS"], "")
+ if house_no is None:
+ house_no = x["ADDRESS"].split(",")[0]
+ x["house_number"] = house_no
+
eco3_fallout["house_number"] = eco3_fallout.apply(
lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1
)
@@ -558,3 +485,58 @@ def cross_reference_epc_programme():
stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90)
]
match.head()
+
+
+def finalise_list_for_non_intrusives():
+ non_intrusives_list = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/20250207 Stonewater "
+ "Non-Intrusives.xlsx"
+ )
+
+ # Remove anything installed under ECO3
+ non_intrusives_list = non_intrusives_list[~non_intrusives_list["Installed under ECO3"]]
+
+ # We mark any properties that were surveyed by Osmosis
+ packages = pd.read_excel(
+ "/Users/khalimconn-kowlessar/Downloads/Stonewater - Bid Packages WIP 14.11.20 V2 "
+ "(1).xlsx",
+ header=13,
+ sheet_name="Modelled Packages"
+ )
+
+ non_intrusives_list["Surveyed by Osmosis"] = non_intrusives_list["Address ID"].isin(
+ packages["Address ID"].values
+ )
+ # Removed 54 addresses
+ final_non_intrusives = non_intrusives_list[
+ ~non_intrusives_list["Surveyed by Osmosis"]
+ ]
+
+ features = pd.read_csv(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - "
+ "master sheet.csv",
+ encoding='latin1'
+ )
+
+ # Add on the organisation reference
+ final_non_intrusives = final_non_intrusives.merge(
+ features[["Organisation Reference", "Address ID"]],
+ how="left",
+ on="Address ID"
+ )
+
+ final_non_intrusives["Postal Region"] = final_non_intrusives["Postcode"].str.split(" ").str[0].str[0:2]
+ selected_regions = final_non_intrusives[
+ final_non_intrusives["Include in non-intrusives"]
+ ]["Postcode"].unique()
+
+ final_non_intrusives["Is in region"] = final_non_intrusives["Postcode"].isin(selected_regions)
+
+ # Filter down:
+ final_non_intrusives = final_non_intrusives[
+ final_non_intrusives["Is in region"]
+ ]
+
+ final_non_intrusives.to_excel(
+ "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Non-intrusives/10022025 Non-Intrusives "
+ "List - final.xlsx")
diff --git a/etl/find_my_epc/AssetListEpcData.py b/etl/find_my_epc/AssetListEpcData.py
index bce8cd1f..1d2e1472 100644
--- a/etl/find_my_epc/AssetListEpcData.py
+++ b/etl/find_my_epc/AssetListEpcData.py
@@ -72,12 +72,20 @@ class AssetListEpcData:
epc_searcher.find_property(skip_os=True)
if epc_searcher.newest_epc is None:
continue
-
- find_epc_searcher = RetrieveFindMyEpc(
- address=epc_searcher.newest_epc["address1"],
- postcode=epc_searcher.newest_epc["postcode"]
- )
- find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ # Attempt both methods:
+ try:
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=epc_searcher.newest_epc["address1"] + ", " + epc_searcher.newest_epc["address2"],
+ postcode=epc_searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
+ except Exception as e:
+ logger.error(f"Error retrieving find my epc data: {e}")
+ find_epc_searcher = RetrieveFindMyEpc(
+ address=epc_searcher.newest_epc["address1"],
+ postcode=epc_searcher.newest_epc["postcode"]
+ )
+ find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
time.sleep(0.5)
# We need uprn
diff --git a/etl/find_my_epc/RetrieveFindMyEpc.py b/etl/find_my_epc/RetrieveFindMyEpc.py
index f93a5a73..9852cc0d 100644
--- a/etl/find_my_epc/RetrieveFindMyEpc.py
+++ b/etl/find_my_epc/RetrieveFindMyEpc.py
@@ -25,6 +25,7 @@ class RetrieveFindMyEpc:
self.postcode = postcode
self.address_cleaned = self.address.replace(",", "").replace(" ", "").lower()
+ self.walls = []
@staticmethod
def extract_low_carbon_sources(soup):
@@ -102,6 +103,8 @@ class RetrieveFindMyEpc:
# 2) Bills estimates
# 3) Recommendations and SAP points
# 4) Low and zero carbon energy sources
+ # 5) The wall types of the property - used for determining if we have an extension wall insulation
+ # recommendation
ratings = address_res.find('desc', {'id': 'svg-desc'}).text
current_rating = ratings.split(".")[0]
@@ -208,6 +211,17 @@ class RetrieveFindMyEpc:
if key not in assessment_data:
raise ValueError(f"Missing key: {key}")
+ # The wall types of the property
+ property_features_table = address_res.find("tbody", class_="govuk-table__body")
+ property_features_table = property_features_table.find_all("tr")
+
+ # Extract wall types
+ self.walls = []
+ for row in property_features_table:
+ cells = row.find_all("td")
+ if row.find("th").text.strip() == "Wall":
+ self.walls.append(cells[0].text.strip())
+
# Finally, we format the recommendations
recommendations = self.format_recommendations(recommendations, assessment_data, sap_2012_date)
@@ -229,8 +243,7 @@ class RetrieveFindMyEpc:
return resulting_data
- @staticmethod
- def format_recommendations(recommendations, assessment_data, sap_2012_date=None):
+ def format_recommendations(self, recommendations, assessment_data, sap_2012_date=None):
"""
This function converts the recommendations to a format that we can use in the engine as a non-intrusive survey
:param recommendations: The recommendations from the EPC
@@ -317,7 +330,8 @@ class RetrieveFindMyEpc:
"roomstat_programmer_trvs", "time_temperature_zone_control"
],
"Replacement warm air unit": [],
- "Secondary glazing": ["secondary_glazing"]
+ "Secondary glazing": ["secondary_glazing"],
+ "Condensing heating unit": ["boiler_upgrade"],
}
survey = True
@@ -330,6 +344,8 @@ class RetrieveFindMyEpc:
for rec in recommendations:
mapped = measure_map[rec["measure"]]
for measure in mapped:
+ if measure == "cavity_wall_insulation" and "solid brick" in self.walls[0].lower():
+ measure = "extension_cavity_wall_insulation"
to_append = {
"type": measure,
"sap_points": rec["sap_points"],
diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py
deleted file mode 100644
index 8d19aa84..00000000
--- a/etl/route_march_data_pull/app.py
+++ /dev/null
@@ -1,396 +0,0 @@
-import os
-import time
-
-import pandas as pd
-import numpy as np
-from tqdm import tqdm
-
-from dotenv import load_dotenv
-from backend.SearchEpc import SearchEpc
-from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
-from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
-
-from recommendations.recommendation_utils import (
- estimate_perimeter,
- estimate_external_wall_area,
- estimate_number_of_floors
-)
-
-load_dotenv(dotenv_path="backend/.env")
-EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
-
-
-def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map):
- epc_data = []
- errors = []
- no_epc = []
- for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
- try:
- postcode = home[postcode_column]
- house_number = home[address1_column].strip()
- full_address = home[fulladdress_column].strip()
- house_no = SearchEpc.get_house_number(address=str(house_number), postcode=postcode)
- if house_no is None:
- house_no = house_number
- uprn = manual_uprn_map.get(full_address, None)
-
- searcher = SearchEpc(
- address1=str(house_no),
- postcode=postcode,
- auth_token=EPC_AUTH_TOKEN,
- os_api_key="",
- property_type=None,
- fast=True,
- full_address=full_address,
- max_retries=5,
- uprn=uprn
- )
- # Force the skipping of estimating the EPC
- searcher.ordnance_survey_client.property_type = None
- searcher.ordnance_survey_client.built_form = None
-
- searcher.find_property(skip_os=True)
-
- # Check if we have a flat or appartment
- if searcher.newest_epc is None and uprn is None:
- # Try again:
- if SearchEpc.get_house_number(address=str(house_number), postcode=postcode) is None:
- # Backup
- add1 = full_address.split(",")
- if len(add1) > 1:
- add1 = add1[1].strip()
- else:
- # Try splitting on space
- add1 = full_address.split(" ")[0].strip()
-
- else:
- add1 = str(house_number)
- searcher = SearchEpc(
- address1=add1,
- postcode=postcode,
- auth_token=EPC_AUTH_TOKEN,
- os_api_key="",
- property_type=None,
- fast=True,
- full_address=full_address,
- max_retries=5
- )
-
- if (
- "flat" in house_number.lower() or "apartment" in house_number.lower() or "apt" in
- house_number.lower()
- ):
- searcher.ordnance_survey_client.property_type = "Flat"
-
- searcher.find_property(skip_os=True)
-
- if searcher.newest_epc is None:
- no_epc.append(home["row_id"])
- continue
-
- # Look for EPC recommendatons
- try:
- property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
- except:
- property_recommendations = {"rows": []}
-
- # Retrieve data from FindMyEPC
- try:
- find_epc_searcher = RetrieveFindMyEpc(
- address=searcher.newest_epc["address"], postcode=searcher.newest_epc["postcode"]
- )
- find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
- except ValueError as e:
- if "No EPC found" in str(e) and "address1" in searcher.newest_epc:
- find_epc_searcher = RetrieveFindMyEpc(
- address=searcher.newest_epc["address1"], postcode=searcher.newest_epc["postcode"]
- )
- find_epc_data = find_epc_searcher.retrieve_newest_find_my_epc_data()
- else:
- find_epc_data = {}
- except Exception as e:
- raise Exception(f"Error retrieving FindMyEPC data: {e}")
- time.sleep(np.random.uniform(0.1, 1))
-
- epc = {
- "row_id": home["row_id"],
- **searcher.newest_epc.copy(),
- "recommendations": property_recommendations["rows"],
- "find_my_epc_data": find_epc_data,
- }
-
- epc_data.append(epc)
- except Exception as e:
- errors.append(home["row_id"])
- time.sleep(5)
-
- return epc_data, errors, no_epc
-
-
-def extract_address1(asset_list, full_address_col, method="first_two_words"):
- if method == "first_two_words":
- asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
- return asset_list
-
- if method == "first_word":
- asset_list["address1_extracted"] = asset_list[full_address_col].str.split(" ").str[0]
- return asset_list
-
- raise ValueError(f"Method {method} not recognized")
-
-
-def app():
- """
- This app is EPC pulling data for some properties owned by Livewest
-
- Data request contents:
- Date of last EPC
- Reason for EPC
- SAP score on register
- Property Type
- Property Area
- Property Age
- Any Dimensions (HLP,PW,RH)
- Property Wall Construction
- Heating Type
- Secondary Heating
- Loft Insulation Depth
-
- Additional if possible:
- Heat loss calculations
- EPC recommendations
- Property UPRN
-
- """
- DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Southern"
- DATA_FILENAME = "January 2025 Additions Query.xlsx"
- SHEET_NAME = "Jan 2025 additions"
- POSTCODE_COLUMN = "Post Code"
- FULLADDRESS_COLUMN = "Street / Block Name"
- ADDRESS1_COLUMN = None
- ADDRESS1_METHOD = "first_word"
- ADDRESS_COLS_TO_CONCAT = []
-
- # Maps addresses to uprn in problematic cases
- MANUAL_UPRN_MAP = {
- "Ardelagh Ardelagh Faris Lane Woodham Addlestone KT15 3DJ": 100061484560
- }
-
- asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)
- asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
- asset_list["row_id"] = asset_list.index
-
- # We clean up portential non-breaking spaces, and double spaces
- for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
- asset_list[col] = asset_list[col].astype(str)
- asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
- asset_list[col] = asset_list[col].str.replace(' ', ' ', regex=False)
-
- if ADDRESS1_COLUMN is None:
- ADDRESS1_COLUMN = "address1_extracted"
- asset_list = extract_address1(
- asset_list=asset_list, full_address_col=FULLADDRESS_COLUMN, method=ADDRESS1_METHOD
- )
-
- if FULLADDRESS_COLUMN is None:
- FULLADDRESS_COLUMN = "fulladdress_extracted"
- # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
- asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(lambda x: ", ".join(x), axis=1)
-
- # We check for duplicated addresses
- asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
- if asset_list["deduper"].duplicated().sum():
- # Drop the dupes
- print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
- asset_list = asset_list[~asset_list["deduper"].duplicated()]
- asset_list = asset_list.drop(columns=["deduper"])
-
- epc_data, errors, no_epc = get_data(
- asset_list=asset_list,
- fulladdress_column=FULLADDRESS_COLUMN,
- address1_column=ADDRESS1_COLUMN,
- postcode_column=POSTCODE_COLUMN,
- manual_uprn_map=MANUAL_UPRN_MAP
- )
-
- # We now retrieve any failed properties
- asset_list_failed = asset_list[asset_list["row_id"].isin(errors)]
- epc_data_failed, _, _ = get_data(
- asset_list=asset_list_failed,
- fulladdress_column=FULLADDRESS_COLUMN,
- address1_column=ADDRESS1_COLUMN,
- postcode_column=POSTCODE_COLUMN,
- manual_uprn_map=MANUAL_UPRN_MAP
- )
-
- no_data = asset_list[asset_list["row_id"].isin(no_epc)]
- print(no_data[[FULLADDRESS_COLUMN, POSTCODE_COLUMN]])
-
- # Append the failed data to the main data
- epc_data.extend(epc_data_failed)
-
- epc_df = pd.DataFrame(epc_data)
-
- # We expand out the recommendations
- recommendations_df = epc_df[["row_id", "recommendations"]]
-
- unique_recommendations = set()
- for _, row in recommendations_df.iterrows():
- unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])
-
- columns = ["row_id"] + list(unique_recommendations)
- transformed_data = []
- for _, row in recommendations_df.iterrows():
- # Initialize a dictionary for this row with False for all recommendations
- row_data = {col: False for col in columns}
- row_data["row_id"] = row["row_id"]
-
- # Set True for each recommendation present in this row
- for rec in row["recommendations"]:
- recommendation_text = rec["improvement-summary-text"]
- row_data[recommendation_text] = True
-
- # Append the row data to transformed_data
- transformed_data.append(row_data)
-
- transformed_df = pd.DataFrame(transformed_data)
- # Drop the column that is ""
- if "" in transformed_df.columns:
- transformed_df = transformed_df.drop(columns=[""])
-
- # Get the find my epc data
- find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
- pd.json_normalize(epc_df["find_my_epc_data"])
- )
- # We check if we get the solar pv column:
- if "Solar photovoltaics" not in find_my_epc_data.columns:
- find_my_epc_data["Solar photovoltaics"] = False
-
- # Retrieve just the data we need
- epc_df = epc_df[
- [
- "row_id",
- "uprn",
- "address1",
- "address",
- "postcode",
- "property-type",
- "built-form",
- "inspection-date",
- "current-energy-rating",
- "current-energy-efficiency",
- "roof-description",
- "walls-description",
- "floor-description",
- "transaction-type",
- # New fields needed
- "secondheat-description",
- "total-floor-area",
- "construction-age-band",
- "floor-height",
- "number-habitable-rooms",
- "mainheat-description",
- #
- "energy-consumption-current", # kwh/m2
- "photo-supply",
- ]
- ].rename(columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"})
-
- asset_list = asset_list.merge(
- epc_df,
- how="left",
- on="row_id"
- ).merge(
- find_my_epc_data[
- [
- "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
- "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
- "Assessor’s ID", "Solar photovoltaics"
- ]
- ].rename(
- columns={
- "Solar photovoltaics": "Has Solar PV",
- "heating_text": "Heating Estimated kWh",
- "hot_water_text": "Hot Water Estimated kWh",
- }
- ),
- how="left",
- on="row_id"
- )
-
- asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
- asset_list = asset_list.drop(columns=["photo-supply"])
-
- # Rename the columns
- asset_list = asset_list.rename(columns={
- "inspection-date": "Date of last EPC",
- "current-energy-efficiency": "SAP score on register",
- "current-energy-rating": "EPC rating on register",
- "property-type": "Property Type",
- "built-form": "Archetype",
- "total-floor-area": "Property Floor Area",
- "construction-age-band": "Property Age Band",
- "floor-height": "Property Floor Height",
- "number-habitable-rooms": "Number of Habitable Rooms",
- "walls-description": "Wall Construction",
- "roof-description": "Roof Construction",
- "floor-description": "Floor Construction",
- "mainheat-description": "Heating Type",
- "secondheat-description": "Secondary Heating",
- "transaction-type": "Reason for last EPC",
- "energy-consumption-current": "Heat Demand (kWh/m2)",
- })
-
- asset_list["Estimated Number of Floors"] = asset_list.apply(
- lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
- x["Property Type"]) else None, axis=1
- )
-
- asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
- # Replace "" value with None
- asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
- asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)
-
- asset_list["Estimated Perimeter (m)"] = asset_list.apply(
- lambda x: estimate_perimeter(
- floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
- num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
- ), axis=1
- )
-
- asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
- lambda x: estimate_external_wall_area(
- num_floors=x["Estimated Number of Floors"],
- floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
- perimeter=x["Estimated Perimeter (m)"],
- built_form=x["Archetype"]
- ),
- axis=1
- )
-
- asset_list["Roof Insulation Thickness"] = asset_list.apply(
- lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
- x["Roof Construction"]) else None,
- axis=1
- )
-
- # For all of the columns in transformed_df, prefix with "Recommendation: "
- for col in transformed_df.columns:
- if col == "row_id":
- continue
- transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})
-
- asset_list = asset_list.merge(
- transformed_df,
- how="left",
- on="row_id"
- )
- asset_list = asset_list.drop(columns=["row_id", "index"])
-
- # Store as an excel
- filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull - Main.xlsx"
- asset_list.to_excel(filename, index=False)
-
- matches_review = asset_list[
- [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
- ]
diff --git a/recommendations/HeatingRecommender.py b/recommendations/HeatingRecommender.py
index c5c07f89..e4dd3a78 100644
--- a/recommendations/HeatingRecommender.py
+++ b/recommendations/HeatingRecommender.py
@@ -852,6 +852,8 @@ class HeatingRecommender:
else:
heating_simulation_config["mainheat_energy_eff_ending"] = self.property.data["mainheat-energy-eff"]
+ # TODO: We possibly shouldn't touch the hot water energy efficiency if we aren't recommending dual immersion
+ # we'll keep this for the moment though
if self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]:
heating_simulation_config["hot_water_energy_eff_ending"] = "Average"
else:
@@ -993,7 +995,7 @@ class HeatingRecommender:
# We check if there's a mains connection and the hot water is inefficient, as this will improve with a boiler
has_inefficient_water = (
self.property.data["mains-gas-flag"] and
- self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor", "Average"]
+ self.property.data["hot-water-energy-eff"] in ["Very Poor", "Poor"]
)
non_invasive_recommendation = next((
diff --git a/recommendations/Recommendations.py b/recommendations/Recommendations.py
index 15614a0b..715332a5 100644
--- a/recommendations/Recommendations.py
+++ b/recommendations/Recommendations.py
@@ -503,7 +503,9 @@ class Recommendations:
impact_summary.append(
{
"phase": rec["phase"],
+ "representative": rec["recommendation_id"] in representative_ids,
"recommendation_id": rec["recommendation_id"],
+ "measure_type": rec["measure_type"],
"sap": sap + rec["sap_points"],
"carbon": carbon - rec["co2_equivalent_savings"],
"heat_demand": heat_demand - rec["heat_demand"],
@@ -621,6 +623,13 @@ class Recommendations:
if li_sap_limit is not None:
property_phase_impact["sap"] = min(property_phase_impact["sap"], li_sap_limit)
+ if rec["type"] == "solar_pv":
+ # We use the SAP points in the recommendation as a minimum
+ property_phase_impact["sap"] = (
+ rec["sap_points"] if property_phase_impact["sap"] < rec["sap_points"] else
+ property_phase_impact["sap"]
+ )
+
# Insert this information into the recommendation.
if not rec.get("survey", False):
rec["sap_points"] = property_phase_impact["sap"]
@@ -647,7 +656,9 @@ class Recommendations:
return property_recommendations, impact_summary
@staticmethod
- def map_descriptions_to_fuel(heating_description, hotwater_description, main_fuel_description):
+ def map_descriptions_to_fuel(
+ heating_description, hotwater_description, main_fuel_description, descriptions_to_fuel_types
+ ):
# Handle the case of community schemes
if (heating_description == "Community scheme") or (hotwater_description == "Community scheme"):
@@ -660,7 +671,7 @@ class Recommendations:
}
raise NotImplementedError("Handle this case")
- mapped = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[heating_description]
+ mapped = descriptions_to_fuel_types[heating_description]
heating_fuel = mapped["fuel"]
if hotwater_description in [
@@ -680,7 +691,7 @@ class Recommendations:
"heating_cop": mapped["cop"], "hotwater_cop": 1
}
- mapped_hotwater = assumptions.DESCRIPTIONS_TO_FUEL_TYPES[hotwater_description]
+ mapped_hotwater = descriptions_to_fuel_types[hotwater_description]
return {
"heating_fuel_type": heating_fuel, "hotwater_fuel_type": mapped_hotwater["fuel"],
@@ -689,7 +700,7 @@ class Recommendations:
@classmethod
def calculate_recommendation_tenant_savings(
- cls, property_instance, kwh_simulation_predictions, property_recommendations
+ cls, property_instance, kwh_simulation_predictions, property_recommendations, ashp_cop=None
):
"""
This method inserts the kwh savings and the bill savings that the customer will make from the recommendations
@@ -701,9 +712,12 @@ class Recommendations:
:param property_instance: Instance of the Property class, for the home associated to property_id
:param kwh_simulation_predictions: dictionary of predictions from the model apis
:param property_recommendations: dictionary of recommendations for the property
+ :param ashp_cop: The coefficient of performance for the air source heat pump.
:return:
"""
+ ashp_cop = ashp_cop if ashp_cop else assumptions.AVERAGE_ASHP_EFFICIENCY
+
kwh_impact_table = kwh_simulation_predictions["heating_kwh_predictions"][
kwh_simulation_predictions["heating_kwh_predictions"]["property_id"] == str(property_instance.id)
].merge(
@@ -772,12 +786,19 @@ class Recommendations:
if kwh_impact_table.loc[i, col] > previous_phase[col].max():
kwh_impact_table.loc[i, col] = previous_phase[col].max()
+ descriptions_to_fuel_types = assumptions.DESCRIPTIONS_TO_FUEL_TYPES
+ # We override the air source heat pump efficiencies with the supplied COP
+ ashp_keys = [k for k in descriptions_to_fuel_types.keys() if "air source heat pump" in k.lower()]
+ for k in ashp_keys:
+ descriptions_to_fuel_types[k]["cop"] = ashp_cop
+
# For heating system recommendations, this could result in a fuel type change so we reflect that
fuel_mapping = pd.DataFrame([
{
"id": epc["id"],
**cls.map_descriptions_to_fuel(
- epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"]
+ epc["mainheat-description"], epc["hotwater-description"], epc["main-fuel"],
+ descriptions_to_fuel_types
)
} for epc in property_instance.updated_simulation_epcs
])
@@ -791,7 +812,8 @@ class Recommendations:
**cls.map_descriptions_to_fuel(
property_instance.data["mainheat-description"],
property_instance.data["hotwater-description"],
- property_instance.data["main-fuel"]
+ property_instance.data["main-fuel"],
+ descriptions_to_fuel_types
)
}
]
diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py
index 95f189d3..a97dbcb3 100644
--- a/recommendations/SolarPvRecommendations.py
+++ b/recommendations/SolarPvRecommendations.py
@@ -14,11 +14,16 @@ class SolarPvRecommendations:
# This was previously set to 250w, but has been upped to 400 based on the systems used by Cotswolrd Energy Group
SOLAR_PANEL_WATTAGE = 400
+ # For domestic properties, we don't recommend a solar PV system with wattage outside of these
+ # bounds
MAX_SYSTEM_WATTAGE = 6000
MIN_SYSTEM_WATTAGE = 1000
+ # The maximum area of roof we allow to be covered in solar panels for our recommendations.
MAX_ROOF_AREA_PERCENTAGE = 0.7
+ SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE = 1
+
def __init__(self, property_instance):
"""
:param property_instance: Instance of the Property class, for the home associated to property_id
@@ -212,6 +217,20 @@ class SolarPvRecommendations:
roof_coverage_percent = round(recommendation_config["panneled_roof_area"] / roof_area * 100)
# We round up to the nearest 5
roof_coverage_percent = np.ceil(roof_coverage_percent / 5) * 5
+
+ # Typically, we've observed that every 5% of additional roof coverage will result in at least
+ # 1 additional SAP point (though often 2 points). Given this, we can add a reasonable minimum
+ # for the number of SAP points we might expect. We've observed that in some cases where properties
+ # are hitting the higher SAP scores (e.g. EPC A and above), the model can sometimes under-predict
+ # the number of SAP points. This appears to be due to a relatively small number of properties
+ # actually achieving the upper echelons of EPC rating. This can be the case if we're simulating a
+ # whole house retrofit where the home is getting complete insulation, a heat pump and solar panels.
+ # Because panels are the final recommendation, they are often the measure that takes the home
+ # into the medium to high EPC A ranges and so, because of a lack of training data, we
+ # might sometimes under-predict. This minimum is intended to reduce the negative impact
+ # of this. This minimum is used in Recommendations.calculate_recommendation_impact
+ minimum_sap_points = (roof_coverage_percent / 5) * self.SAP_POINTS_PER_5_PERCENT_ROOF_COVERAGE
+
for has_battery in [False, True]:
cost_result = self.costs.solar_pv(
has_battery=has_battery,
@@ -240,7 +259,7 @@ class SolarPvRecommendations:
"description": description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": minimum_sap_points,
"already_installed": already_installed,
**cost_result,
# This is required for simulating the SAP impact. solar_pv_percentage is between 0 & 1 so we
diff --git a/recommendations/WindowsRecommendations.py b/recommendations/WindowsRecommendations.py
index 1f755369..46e56c93 100644
--- a/recommendations/WindowsRecommendations.py
+++ b/recommendations/WindowsRecommendations.py
@@ -215,21 +215,29 @@ class WindowsRecommendations:
"glazed-type": glazed_type_ending,
}
+ measure_type = "double_glazing" if not is_secondary_glazing else "secondary_glazing"
+
+ non_invasive_recommendation = next(
+ (r for r in self.property.non_invasive_recommendations if r["type"] in ["windows_glazing", measure_type]),
+ {}
+ )
+
self.recommendation = [
{
"phase": phase,
"parts": [],
"type": "windows_glazing",
- "measure_type": "double_glazing" if not is_secondary_glazing else "secondary_glazing",
+ "measure_type": measure_type,
"description": description,
"starting_u_value": None,
"new_u_value": None,
- "sap_points": None,
+ "sap_points": non_invasive_recommendation.get("sap_points", None),
"already_installed": already_installed,
**cost_result,
"is_secondary_glazing": is_secondary_glazing,
"description_simulation": description_simulation,
"simulation_config": simulation_config,
+ "survey": non_invasive_recommendation.get("survey", None),
}
]
diff --git a/recommendations/recommendation_utils.py b/recommendations/recommendation_utils.py
index 00da6107..602684cf 100644
--- a/recommendations/recommendation_utils.py
+++ b/recommendations/recommendation_utils.py
@@ -205,7 +205,7 @@ def get_wall_u_value(
mapped_value = wall_uvalues_df[
wall_uvalues_df["Wall_type"] == mapped_description
- ][age_band].values[0]
+ ][age_band].values[0]
if pd.isnull(mapped_value) and "Park home" in mapped_description:
# We don't know enough in this case so we default to 0
@@ -428,6 +428,9 @@ def estimate_number_of_floors(property_type):
Using the property type, we estimate the number of floors in the property
"""
+ if property_type is None:
+ return None
+
if property_type == "House":
number_of_floors = 2
elif property_type in ["Flat", "Bungalow"]:
@@ -560,7 +563,7 @@ def get_floor_u_value(
insulation_lookup = s11[
s11["Age_band"].str.contains(age_band) & s11["Floor_construction"]
== floor_type
- ]
+ ]
if insulation_lookup.empty:
insulation_thickness = 0
else:
diff --git a/survey_report/app.py b/survey_report/app.py
new file mode 100644
index 00000000..f6eddb8d
--- /dev/null
+++ b/survey_report/app.py
@@ -0,0 +1,270 @@
+import os
+import requests
+import PyPDF2
+from string import Template
+
+import pandas as pd
+
+from survey_report.extraction.detect_report_type import detect_report_type
+from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor
+
+
+def generate_html_report(template_path, output_path, data):
+    """
+    Renders an HTML report from a string.Template file and writes the result to disk.
+
+    Placeholders that have no matching key in `data` are left untouched in the output, because
+    safe_substitute is used rather than substitute.
+
+    Args:
+    - template_path (str): Path of the HTML template to read (UTF-8).
+    - output_path (str): Path the rendered HTML is written to (UTF-8).
+    - data (dict): Placeholder name -> value mapping injected into the template.
+    """
+    # Load the raw template text
+    with open(template_path, "r", encoding="utf-8") as template_file:
+        raw_template = template_file.read()
+
+    # Fill in the placeholders; unknown placeholders are kept as-is instead of raising
+    rendered_html = Template(raw_template).safe_substitute(data)
+
+    # Persist the rendered report
+    with open(output_path, "w", encoding="utf-8") as report_file:
+        report_file.write(rendered_html)
+
+    print(f"HTML report generated successfully: {output_path}")
+
+
+def stringify_number(num: int, rounding: bool = True) -> str:
+    """
+    Formats a number for display in the report.
+
+    Values that still have 5 figures or fewer after rounding are shown with thousands separators
+    (e.g. "1,300"). Larger values are shown as whole thousands with a "k" suffix (e.g. "250k").
+
+    :param num: The value to format
+    :param rounding: When True, the value is rounded up to the next 100 for the comma format, or to the
+        next 1,000 for the "k" format. When False, the comma format shows the value as given and the
+        "k" format floors to whole thousands.
+    :return: The formatted string
+    """
+    if num < 100000:  # 5 figures or fewer
+        rounded_num = ((num + 99) // 100) * 100 if rounding else num
+        # Rounding up can tip a value such as 99,950 over to 100,000. In that case we fall through to the
+        # "k" format, so that 100,000 is always rendered the same way regardless of how we got there
+        if rounded_num < 100000:
+            return f"{rounded_num:,}"
+        num = rounded_num
+
+    # More than 5 figures
+    rounded_num = ((num + 999) // 1000) * 1000 if rounding else num
+    return f"{rounded_num // 1000}k"
+
+
+class PlacidApi:
+    """
+    Minimal client for the Placid REST API, used to create the survey report PDF from a template
+    and download the result.
+    """
+
+    # Errors as defined by docs: https://placid.app/docs/2.0/rest/errors
+    ERROR_CODES = {
+        400: "Bad request",
+        401: "Unauthorized",
+        404: "Template Not found",
+        422: "Validation error",
+        429: "Rate limit exceeded",
+        500: "Internal server error",
+    }
+
+    # Seconds to wait on any single HTTP request before giving up, so a stalled connection can't hang forever
+    REQUEST_TIMEOUT = 30
+
+    def __init__(self, api_key):
+        """
+        :param api_key: The Placid API token. It is sent as a bearer token with every API request.
+        """
+        self.api_key = api_key
+
+        self.headers = {
+            "Authorization": f"Bearer {self.api_key}",
+            "Content-Type": "application/json",
+            "Accept": "application/json",
+        }
+
+    def _raise_for_error(self, response):
+        """
+        Raises a requests.HTTPError, described using ERROR_CODES, if the API responded with an error status.
+        """
+        if response.status_code < 400:
+            return
+
+        reason = self.ERROR_CODES.get(response.status_code, "Unexpected error")
+        raise requests.HTTPError(
+            f"Placid API request failed with status {response.status_code} ({reason}): {response.text}",
+            response=response,
+        )
+
+    def create_pdf(
+        self,
+        template_uuid: str,
+        current_epc_rating: str,
+        current_epc_rating_colour: str,
+        post_retrofit_epc_rating: str,
+        post_retrofit_epc_rating_colour: str,
+    ):
+        """
+        Requests the creation of a single page PDF from the given template.
+
+        :param template_uuid: The Placid template to render
+        :param current_epc_rating: Text for the current_epc_rating layer
+        :param current_epc_rating_colour: Text colour for the current_epc_rating layer
+        :param post_retrofit_epc_rating: Text for the post_retrofit_epc_rating layer
+        :param post_retrofit_epc_rating_colour: Text colour for the post_retrofit_epc_rating layer
+        :raises requests.HTTPError: If the API responds with an error status
+        :return: The decoded JSON response body. The "id" in it is what get_pdf expects
+        """
+        url = "https://api.placid.app/api/rest/pdfs"
+
+        body = {
+            "webhook_success": None,
+            "passthrough": None,
+            "pages": [
+                {
+                    "template_uuid": template_uuid,
+                    "layers": {
+                        "current_epc_rating": {
+                            "text": current_epc_rating,
+                            "text_color": current_epc_rating_colour,
+                        },
+                        "post_retrofit_epc_rating": {
+                            "text": post_retrofit_epc_rating,
+                            "text_color": post_retrofit_epc_rating_colour,
+                        }
+                    },
+                },
+            ]
+        }
+
+        response = requests.post(
+            url,
+            headers=self.headers,
+            json=body,
+            timeout=self.REQUEST_TIMEOUT,
+        )
+        self._raise_for_error(response)
+
+        return response.json()
+
+    def get_pdf(
+        self,
+        pdf_id: str,
+        output_path: str = "survey_report/example_data/output.pdf",
+        poll_interval: float = 5,
+        max_attempts: int = 60,
+    ):
+        """
+        Poll the API every 5 seconds (poll_interval) until the PDF is ready, then download it.
+
+        A newly created PDF is queued and has no pdf_url yet, so we have to keep asking until the
+        url has been populated.
+
+        :param pdf_id: The id returned by create_pdf
+        :param output_path: Where the downloaded PDF is written
+        :param poll_interval: Seconds to wait between polls
+        :param max_attempts: Maximum number of polls before giving up
+        :raises requests.HTTPError: If the API or the download responds with an error status
+        :raises RuntimeError: If the API reports that the PDF could not be created
+        :raises TimeoutError: If the PDF is still not ready after max_attempts polls
+        :return: The path the PDF was written to
+        """
+        import time
+
+        url = f"https://api.placid.app/api/rest/pdfs/{pdf_id}"
+
+        pdf_url = None
+        for attempt in range(max_attempts):
+            if attempt:
+                # Only wait between polls, not before the first one
+                time.sleep(poll_interval)
+
+            response = requests.get(
+                url,
+                headers=self.headers,
+                timeout=self.REQUEST_TIMEOUT,
+            )
+            self._raise_for_error(response)
+            response_body = response.json()
+
+            # NOTE(review): "error" is the failure status described in the Placid docs - confirm
+            if response_body.get("status") == "error":
+                raise RuntimeError(f"Placid failed to create PDF {pdf_id}: {response_body}")
+
+            pdf_url = response_body.get("pdf_url")
+            if pdf_url:
+                break
+
+        if not pdf_url:
+            raise TimeoutError(f"PDF {pdf_id} was not ready after {max_attempts} attempts")
+
+        # Download the PDF from this url
+        pdf_download = requests.get(pdf_url, timeout=self.REQUEST_TIMEOUT)
+        pdf_download.raise_for_status()
+        with open(output_path, "wb") as f:
+            f.write(pdf_download.content)
+
+        return output_path
+
+
+def handler():
+    """
+    Performs the data extraction process for the survey report.
+
+    For each configured set of PDFs (current site notes, EPR and post retrofit scenario site notes) this
+    extracts the text, estimates the valuation uplift, bill savings and carbon savings between the current
+    and scenario site notes, and then requests and downloads the report PDF from Placid.
+
+    The Placid API key is read from the PLACID_API_KEY environment variable so that the secret is not
+    committed to the repository.
+
+    :raises RuntimeError: If the PLACID_API_KEY environment variable is not set
+    :raises NotImplementedError: Once the first property has been extracted, because the payback period
+        has not been implemented yet
+    :return:
+    """
+
+    PLACID_API_KEY = os.environ.get("PLACID_API_KEY")
+    if not PLACID_API_KEY:
+        raise RuntimeError("The PLACID_API_KEY environment variable must be set to generate the survey report")
+    TEMPLATE_UUID = "5bst9mh1q9lk9"
+    placid_api = PlacidApi(PLACID_API_KEY)
+
+    current_property_value = 250000  # Needs to be an input
+
+    EPC_COLOURS = {
+        "A": "#117d58",
+        "B": "#2da55c",
+        "C": "#8dbd40",
+        "D": "#f7cd14",
+        "E": "#f3a96a",
+        "F": "#ef8026",
+        "G": "#e41e3b",
+    }
+
+    # Placeholder location of the example data. Every flat follows the same file naming pattern
+    example_data_dir = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data"
+    folders = [
+        {
+            "site_notes": f"{example_data_dir}/Flat {flat}/3 WILLIS ROAD FLAT {flat} PRE EPR SITE NOTES.pdf",
+            "epr": f"{example_data_dir}/Flat {flat}/3 WILLIS ROAD FLAT {flat} PRE EPR PDF.pdf",
+            "scenario_site_notes": (
+                f"{example_data_dir}/Flat {flat}/3 WILLIS ROAD FLAT {flat} POST EPR SITE NOTES.pdf"
+            ),
+        }
+        for flat in (1, 2, 3)
+    ]
+
+    data = []
+    for data_config in folders:
+
+        file_mapping = {}
+        for filename, filepath in data_config.items():
+            with open(filepath, "rb") as f:
+                pdf = PyPDF2.PdfReader(f)
+                first_page = pdf.pages[0].extract_text()
+                text = "".join(page.extract_text() for page in pdf.pages)
+
+            # Check the report type. Files we don't recognise are left out of the mapping
+            report_type = detect_report_type(first_page)
+            if report_type is not None:
+                file_mapping[filename] = text
+
+        # This is only set up to work with quido site notes so we must have it
+        site_notes_extractor = SiteNotesExtractor(file_mapping["site_notes"])
+        site_notes = site_notes_extractor.extract_all()
+
+        # We also must have an EPR
+        epr_extractor = EPRExtractor(file_mapping["epr"])
+        epr = epr_extractor.extract_all()
+
+        # Valuation simulation
+        scenario_site_notes_extractor = SiteNotesExtractor(file_mapping["scenario_site_notes"])
+        scenario_site_notes = scenario_site_notes_extractor.extract_all()
+
+        from backend.ml_models.Valuation import PropertyValuation
+        valuation_uplift = PropertyValuation.estimate_valuation_improvement(
+            current_value=current_property_value,
+            current_epc=site_notes["Current EPC Band"],
+            target_epc=scenario_site_notes["Current EPC Band"],
+        )
+        # TODO - should convert this, when it's more than 5 figures and we should certainly stringify this
+
+        valuation_difference = round(valuation_uplift["average_increased_value"] - current_property_value)
+
+        # Prepare the data for output
+        bill_savings = round(
+            site_notes['Estimated Annual Energy Cost (£)'] - scenario_site_notes['Estimated Annual Energy Cost (£)']
+        )
+
+        carbon_savings = round(
+            site_notes["Current Carbon Emissions (TCO2)"] - scenario_site_notes["Current Carbon Emissions (TCO2)"],
+            2
+        )
+
+        # The payback period is not implemented yet, so this deliberately stops the run here
+        payback_period = None
+        if payback_period is None:
+            raise NotImplementedError("Implement me")
+
+        # We extract the measures from the site notes
+
+        report_data = {
+            "current_epc_rating": site_notes["Current EPC Band"],
+            "current_epc_rating_colour": EPC_COLOURS[site_notes["Current EPC Band"]],
+            "post_retrofit_epc_rating": scenario_site_notes["Current EPC Band"],
+            "post_retrofit_epc_rating_colour": EPC_COLOURS[scenario_site_notes["Current EPC Band"]],
+            "bill_savings": stringify_number(bill_savings),
+            "valuation_improvement": stringify_number(valuation_difference),
+            "carbon_savings": carbon_savings,
+
+        }
+
+        # We now produce the combined data sheet which is the starting figure:
+        # data_sheet = {**epr, **site_notes}
+        # del data_sheet['Building Dimensions']
+        # # We unnest the Total Building Dimensions
+        # data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+        # data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+        # del data_sheet["Total Building Dimensions"]
+
+        create_pdf_response = placid_api.create_pdf(
+            template_uuid=TEMPLATE_UUID, **report_data
+        )
+        # {'id': 769832, 'type': 'pdf', 'status': 'queued', 'pdf_url': None, 'transfer_url': None, 'passthrough': None}
+        # Download locally
+        placid_api.get_pdf(create_pdf_response["id"])
+
+    data = pd.DataFrame(data)
+
+    # Generate the HTML report
+    # Placeholder locations
+    template_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/template.html"
+    output_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/output/report.html"
+    logo_path = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/assets/logo.png"
+    # NOTE(review): data_sheet is only assigned in the commented out block inside the loop, so this call
+    # will raise a NameError until that block is restored
+    generate_html_report(
+        template_path, output_path,
+        data={
+            "address": data_sheet["Address"],
+            "logo_path": logo_path,
+            "current_epc": data_sheet["Current EPC Band"],
+            "current_sap": data_sheet["Current SAP Rating"],
+            "potential_epc": "A",  # TODO PLACEHOLDER
+            "potential_sap": 91,  # TODO PLACEHOLDER
+        }
+    )
diff --git a/survey_report/extraction/detect_report_type.py b/survey_report/extraction/detect_report_type.py
new file mode 100644
index 00000000..434a3fb4
--- /dev/null
+++ b/survey_report/extraction/detect_report_type.py
@@ -0,0 +1,22 @@
+import re
+
+
+def detect_report_type(first_page):
+    """
+    Classifies a report using the text of its first page
+    :param first_page: The text extracted from the first page of the report
+    :return: "quidos_site_notes", "quidos_epr", or None when the page matches neither
+    """
+    # For the minute this only handles quidos files. We have the Elmhurst logic so it can be
+    # introduced here when we need it
+    site_notes_header = r"^Created \d{2}/\d{2}/\d{4} for Quidos Ltd using Argyle software BRE approved calculator"
+    epr_marker = r"\nIQ-Energy\nEnergy Performance Report\nPage 1 of 1"
+
+    # The site notes header has to be at the very start of the page, the EPR marker can be anywhere in it
+    is_site_notes = re.match(site_notes_header, first_page) is not None
+    if is_site_notes:
+        return "quidos_site_notes"
+
+    return "quidos_epr" if re.search(epr_marker, first_page) else None
diff --git a/survey_report/extraction/quidos.py b/survey_report/extraction/quidos.py
new file mode 100644
index 00000000..2e772886
--- /dev/null
+++ b/survey_report/extraction/quidos.py
@@ -0,0 +1,256 @@
+import re
+
+
+class SiteNotesExtractor:
+    """
+    Extracts SAP rating, carbon emissions, energy costs, building dimensions and wall details from the
+    text of an EPC summary (site notes) report.
+    """
+
+    def __init__(self, pdf_text):
+        """
+        Initializes the SiteNotesExtractor with the extracted PDF text.
+
+        :param pdf_text: The full text of the report, as extracted from the PDF
+        """
+        self.text = pdf_text
+        self.data = {}
+
+    def extract_sap_rating(self):
+        """
+        Extracts the current and potential SAP rating from the report.
+
+        :raises ValueError: If the ratings can't be found in the text
+        """
+        pattern = re.search(r"Current SAP rating\s*([A-G])\s*(\d+)\s*Potential SAP rating\s*([A-G])\s*(\d+)", self.text)
+
+        if not pattern:
+            raise ValueError("No SAP rating found in the report")
+
+        self.data.update({
+            "Current EPC Band": pattern.group(1),
+            "Current SAP Rating": int(pattern.group(2)),
+            "Potential EPC Band": pattern.group(3),
+            "Potential SAP Rating": int(pattern.group(4)),
+        })
+
+    def extract_carbon_emissions(self):
+        """
+        Extracts the current annual carbon emissions (TCO2).
+
+        :raises ValueError: If the emissions can't be found in the text
+        """
+        pattern = re.search(r"Current annual emissions\s*([\d.]+)\s*\(TCO2\)", self.text)
+
+        if not pattern:
+            raise ValueError("No carbon emissions found in the report")
+
+        self.data.update({
+            "Current Carbon Emissions (TCO2)": float(pattern.group(1)),
+        })
+
+    def extract_building_dimensions(self):
+        """
+        Extracts dimensions for each building part and stores them in a list.
+        Handles Main Property and multiple extensions.
+
+        Also stores the total floor area and total heat loss area across all building parts.
+
+        :raises ValueError: If the dimensions section, or any building part within it, can't be found
+        """
+
+        # Locate the Dimensions section, which runs from the table header to the start of section 5.0
+        dimensions_section = re.search(
+            r"Dimension Type (?:internal|external)\nPart Floor Area \(m2\) Room Height \(m\) Loss Perimeter \(m\) "
+            r"Party Wall Length \(m\)\n"
+            r"(.*?)\n5\.0 Conservatory", self.text, re.DOTALL
+        )
+
+        if not dimensions_section:
+            raise ValueError("Failed to locate the dimensions section in the text.")
+
+        dimensions_text = dimensions_section.group(1)
+
+        # Pattern to match each building part (Main Property, Extension 1, Extension 2, etc.)
+        building_part_pattern = re.compile(
+            r"(Main Property|Extension \d+)\s*(?:Property)?\s*([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"
+        )
+
+        building_parts = []
+        for match in building_part_pattern.finditer(dimensions_text):
+            to_append = {
+                "Building Part": match.group(1).strip(),
+                "Part Floor Area (m2)": float(match.group(2)),
+                "Room Height (m)": float(match.group(3)),
+                "Loss Perimeter (m)": float(match.group(4)),
+                "Party Wall Length (m)": float(match.group(5)),
+            }
+            # We calculate the heat loss area
+            to_append["Heat Loss Area (m2)"] = to_append["Loss Perimeter (m)"] * to_append["Room Height (m)"]
+            building_parts.append(to_append)
+
+        if not building_parts:
+            raise ValueError("No building dimensions found in the report")
+
+        self.data["Building Dimensions"] = building_parts
+        # We calculate some totals
+        self.data["Total Building Dimensions"] = {
+            "floor_area": sum(part["Part Floor Area (m2)"] for part in building_parts),
+            "heat_loss_area": sum(part["Heat Loss Area (m2)"] for part in building_parts),
+        }
+
+    def extract_bills_estimate(self):
+        """
+        Extracts the estimated annual energy costs (£) from the report.
+
+        :raises ValueError: If the energy costs can't be found in the text
+        """
+        pattern = re.search(r"Current annual energy costs £\s*([\d,.]+)", self.text)
+
+        if not pattern:
+            raise ValueError("No bills estimate found in the report")
+
+        # The figure can contain thousands separators, which float() can't parse
+        self.data["Estimated Annual Energy Cost (£)"] = float(pattern.group(1).replace(",", ""))
+
+    def extract_all(self):
+        """
+        Runs all extraction methods and returns a dictionary with extracted data.
+
+        :raises ValueError: If any of the expected sections can't be found in the text
+        :return: The dictionary of extracted data
+        """
+        self.extract_sap_rating()
+        self.extract_carbon_emissions()
+        self.extract_bills_estimate()
+        self.extract_building_dimensions()
+
+        # Extract specific measures
+        # Primary wall
+        # Secondary wall
+        # Roof
+        # Floor
+        # Heating system
+        # Hot water system
+        # Windows
+        # Doors
+        # Lighting
+        # Ventilation
+        # Solar
+
+        return self.data
+
+    def extract_walls(self):
+        """
+        Extracts wall type, insulation, dry-lining, and thickness for each building part,
+        including any alternative wall details within the 7.0 Walls section of the summary PDF text.
+
+        The alternative wall fields are always None for now, see the TODO below. This is not run as part of
+        extract_all and the result is returned rather than stored on self.data.
+
+        :raises ValueError: If the walls section can't be found in the text
+        :return: A list with one dictionary of wall details per building part
+        """
+
+        text = self.text
+        wall_data = []
+
+        # Isolate the 7.0 Walls section
+        wall_section_match = re.search(r"7\.0 Walls\n(.*?)\n8\.0 Roofs", text, re.DOTALL)
+        if not wall_section_match:
+            raise ValueError("Failed to locate the walls section in the text.")
+
+        wall_section = wall_section_match.group(1)
+
+        # Define patterns to match walls for each building part. Every detail line is optional, so the named
+        # group is None when the line is missing for a building part
+        wall_pattern = re.compile(
+            r"(?P<section>Main Property(?: Alternative)?|Extension \d+)\s*\n"
+            r"(?:Construction\s*(?P<construction>[^\n]*)\n)?"
+            r"(?:Insulation\s*(?P<insulation>[^\n]*)\n)?"
+            r"(?:Insulation Thickness\(mm\)\s*(?P<insulation_thickness>[^\n]*)\n)?"
+            r"(?:Wall Thickness Measured\?\s*(?P<thickness_measured>[^\n]*)\n)?"
+            r"(?:Wall Thickness\(mm\)\s*(?P<thickness>\d+))?",
+            re.MULTILINE
+        )
+
+        # TODO: We aren't effectively picking up alternative walls
+        # alt_wall_pattern = re.compile(
+        #     r"Alternative Wall Sheltered\s*.*?\n"
+        #     r".*?Construction\s*(?P<alt_construction>[^\n]*)\n"
+        #     r"Insulation\s*(?P<alt_insulation>[^\n]*)\n"
+        #     r"Insulation Thickness\(mm\)\s*(?P<alt_insulation_thickness>[^\n]*)\n"
+        #     r"Wall Thickness Measured\?\s*(?P<alt_thickness_measured>[^\n]*)\n"
+        #     r"Wall Thickness\(mm\)\s*(?P<alt_thickness>\d+)?",
+        #     re.MULTILINE
+        # )
+
+        for match in wall_pattern.finditer(wall_section):
+            building_part = match.group("section")
+            # has_alternative_wall = "Alternative" in building_part
+            building_part = "Main Property" if "Main Property" in building_part else building_part
+
+            # The thickness group only ever matches digits, so it's safe to convert whenever it's present
+            thickness = match.group("thickness")
+
+            wall_entry = {
+                "Building Part": building_part,
+                "Wall Type": match.group("construction") or "Unknown",
+                "Wall Insulation": match.group("insulation") or "Unknown",
+                "Insulation Thickness (mm)": match.group("insulation_thickness") or "Unknown",
+                "Wall Thickness Measured": match.group("thickness_measured") or "Unknown",
+                "Wall Thickness (mm)": int(thickness) if thickness else None,
+                "Alternative Wall Type": None,
+                "Alternative Wall Insulation": None,
+                "Alternative Insulation Thickness (mm)": None,
+                "Alternative Wall Thickness Measured": None,
+                "Alternative Wall Thickness (mm)": None,
+            }
+
+            # Check if an alternative wall section exists
+            # if has_alternative_wall:
+            #     alt_match = alt_wall_pattern.search(wall_section, match.end())
+            #     if alt_match:
+            #         wall_entry["Alternative Wall Type"] = alt_match.group("alt_construction") or "Unknown"
+            #         wall_entry["Alternative Wall Insulation"] = alt_match.group("alt_insulation") or "Unknown"
+            #         wall_entry["Alternative Insulation Thickness (mm)"] = alt_match.group(
+            #             "alt_insulation_thickness") or "Unknown"
+            #         wall_entry["Alternative Wall Thickness Measured"] = alt_match.group(
+            #             "alt_thickness_measured") or "Unknown"
+            #         wall_entry["Alternative Wall Thickness (mm)"] = int(
+            #             alt_match.group("alt_thickness")) if alt_match.group("alt_thickness") and alt_match.group(
+            #             "alt_thickness").isdigit() else None
+
+            wall_data.append(wall_entry)
+
+        return wall_data
+
+
+class EPRExtractor:
+    """
+    Pulls the address and the space/water heating consumption out of the text of an
+    Energy Performance Report (EPR).
+    """
+
+    def __init__(self, pdf_text):
+        """
+        Stores the extracted PDF text and starts with an empty result dictionary.
+        """
+        self.data = {}
+        self.text = pdf_text
+
+    def extract_heating_consumption(self):
+        """
+        Reads the space heating and water heating figures (KWH) from the report as integers.
+        """
+        heating_regex = r"Space Heating\(KWH\)\s*([\d,]+).*?\nWater Heating\(KWH\)\s*([\d,]+)"
+        found = re.search(heating_regex, self.text, re.DOTALL)
+
+        if found is None:
+            raise ValueError("No heating data found in the report")
+
+        # The figures can contain thousands separators which need stripping before conversion
+        space_heating, water_heating = (int(value.replace(",", "")) for value in found.groups())
+        self.data["Space Heating (KWH)"] = space_heating
+        self.data["Water Heating (KWH)"] = water_heating
+
+    def extract_address(self):
+        """
+        Reads the address from the report, which is the text between the Address and Town labels.
+        """
+        found = re.search(r"Address\s*(.*?)\nTown\s*(.*?)\n", self.text, re.DOTALL)
+
+        if found is None:
+            raise ValueError("No address found in the report")
+
+        # Only the first group is stored, the town is matched but not used
+        self.data["Address"] = found.group(1).strip()
+
+    def extract_all(self):
+        """
+        Runs every extraction method and returns the dictionary of extracted data.
+        """
+        for extract in (self.extract_address, self.extract_heating_consumption):
+            extract()
+        return self.data
diff --git a/etl/route_march_data_pull/requirements.txt b/survey_report/requirements.txt
similarity index 100%
rename from etl/route_march_data_pull/requirements.txt
rename to survey_report/requirements.txt
diff --git a/survey_report/template.html b/survey_report/template.html
new file mode 100644
index 00000000..5d3b6c63
--- /dev/null
+++ b/survey_report/template.html
@@ -0,0 +1,123 @@
+
+
+
+
+
+ Domna Energy Report
+
+
+
+
+
+
+
+
+
+
+
+
Current EPC Rating
+
${current_epc}
+
SAP ${current_sap}
+
+
+
+
Potential EPC Rating
+
${potential_epc}
+
SAP ${potential_sap}
+
+
+
+
+
+
+