mirror of
https://github.com/Hestia-Homes/Model.git
synced 2026-06-08 11:17:27 +00:00
1046 lines
46 KiB
Python
1046 lines
46 KiB
Python
import hashlib
|
||
import os
|
||
import re
|
||
import tiktoken
|
||
from pprint import pprint
|
||
from datetime import datetime
|
||
from openai import OpenAI
|
||
import numpy as np
|
||
import pandas as pd
|
||
from fuzzywuzzy import process
|
||
from utils.logger import setup_logger
|
||
from backend.SearchEpc import SearchEpc
|
||
from BaseUtility import Definitions
|
||
import asset_list.mappings.property_type as property_type_mappings
|
||
import asset_list.mappings.walls as walls_mappings
|
||
import asset_list.mappings.heating_systems as heating_mappings
|
||
import asset_list.mappings.exising_pv as existing_pv_mappings
|
||
|
||
from recommendations.recommendation_utils import (
|
||
estimate_perimeter,
|
||
estimate_external_wall_area,
|
||
estimate_number_of_floors
|
||
)
|
||
|
||
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes
|
||
|
||
# Module-wide logger shared by DataRemapper and AssetList
logger = setup_logger()

# OpenAI API key, read from the environment so that it is never stored in source control.
# Resolves to None when the variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
|
||
|
||
|
||
class DataRemapper:
    """
    Maps free-text values onto a fixed set of standard values.

    Each value is resolved by the first strategy that succeeds: the predefined mapping, an exact match
    against the standard values, a fuzzy match, and finally a single batched OpenAI request for whatever
    is still unmapped. Token usage and an estimated cost are tracked on the instance.
    """

    def __init__(self, standard_values, standard_map=None, max_tokens=1000):
        """
        Initialize the remapper with standard values and a predefined mapping.

        :param standard_values: Set of allowed standardized values.
        :param standard_map: Dictionary of common remappings {raw_value: standard_value}. May be None,
            in which case no predefined remappings are applied.
        :param max_tokens: Upper bound on the prompt size, also passed to the API as max_tokens.
        """
        self.standard_values = standard_values
        self.standard_map = standard_map
        self.fuzzy_threshold = 90  # Adjust fuzzy matching sensitivity
        self.ai_model = "gpt-4-turbo"  # Use gpt-3.5-turbo for cheaper processing

        # Tokenizer for counting tokens
        self.tokenizer = tiktoken.encoding_for_model(self.ai_model)

        # Track token usage and remap dictionary
        self.total_tokens_used = 0
        self.total_cost = 0
        self.remap_dict = {}  # {original_value: standardized_value}
        self.max_tokens = max_tokens  # Limit for OpenAI API

        # Memoization for AI calls
        self.ai_cache = {}  # {tuple(unmapped_values): {original_value: standardized_value}}
        # Capture the response for debugging
        self.ai_response = None

        # OpenAI pricing (as of Feb 2024)
        self.pricing = {
            "gpt-4-turbo": {"input": 0.01 / 1000, "output": 0.03 / 1000},
            "gpt-3.5-turbo": {"input": 0.0015 / 1000, "output": 0.002 / 1000},
        }

        self.openai_client = OpenAI(api_key=OPENAI_API_KEY)

    @staticmethod
    def clean_string(text):
        """
        Basic text cleaning: remove extra spaces, punctuation, and normalize case.

        :param text: Value to clean.
        :return: The cleaned string, or None when text is not a string.
        """
        if not isinstance(text, str):
            return None
        text = text.strip().lower()
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        # Collapse runs of whitespace into a single space
        text = re.sub(r'\s+', ' ', text)
        return text

    @staticmethod
    def _parse_ai_mapping(output_text):
        """
        Parse the model output into a dictionary without executing it.

        JSON is tried first because that is what the prompt asks for. A Python-literal dictionary
        (single quotes) is accepted as a fallback via ast.literal_eval, which only evaluates literals.

        :param output_text: Raw text returned by the model.
        :return: The parsed dictionary, or None when the text is not a dictionary.
        """
        import ast
        import json

        text = (output_text or "").strip()
        # Models sometimes wrap the payload in a markdown fence despite being told not to
        if text.startswith("```"):
            text = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", text)

        try:
            parsed = json.loads(text)
        except ValueError:
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None

        return parsed if isinstance(parsed, dict) else None

    def fuzzy_match(self, text):
        """
        Use fuzzy matching to find the closest standard value.

        :param text: Cleaned value to match.
        :return: The best matching standard value when its score reaches fuzzy_threshold, else None.
        """
        if not text:
            return None
        result = process.extractOne(text, self.standard_values)
        # extractOne yields None when there is nothing to compare against
        if result is None:
            return None
        match, score = result[0], result[1]
        return match if score >= self.fuzzy_threshold else None

    def count_tokens(self, text):
        """Estimate the number of tokens in a given text (0 for empty text)."""
        return len(self.tokenizer.encode(text)) if text else 0

    def ai_standardize(self, unmapped_values):
        """
        Call OpenAI API **once** for all unmapped values to minimize cost, with memoization.

        :param unmapped_values: Values that no other strategy could map.
        :return: Dictionary {original_value: standardized_value}. Every value maps to "unknown" when
            the response cannot be parsed as a dictionary.
        :raises ValueError: If the prompt is longer than max_tokens.
        """
        if not unmapped_values:
            return {}

        # Sorting by string form keeps the cache key stable even when the values have mixed types
        unmapped_tuple = tuple(sorted(unmapped_values, key=str))
        if unmapped_tuple in self.ai_cache:
            return self.ai_cache[unmapped_tuple]  # Return memoized result

        prompt = f"""
        You are an expert in data classification. Standardize each of these values into one of the categories:
        {list(self.standard_values)}.

        Return only a JSON dictionary where:
        - The keys are the original values.
        - The values are the standardized ones.

        Strictly return JSON **without markdown formatting** or extra text.

        Example Output:
        {{
        "BLKHOUS": "block house",
        "BEDSIT": "bedsit"
        }}

        Values to standardize:
        {unmapped_values}
        """

        # Count input tokens
        input_tokens = self.count_tokens(prompt)
        if input_tokens > self.max_tokens:
            raise ValueError("Input tokens exceed the maximum limit.")

        logger.info("Calling OpenAI API for standardization...")
        response = self.openai_client.chat.completions.create(
            model=self.ai_model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=self.max_tokens,
            temperature=0.1,
        )

        # The message content can be empty, so guard before stripping
        output_text = (response.choices[0].message.content or "").strip()
        output_tokens = self.count_tokens(output_text)  # Count output tokens

        # Track total token usage
        self.total_tokens_used += input_tokens + output_tokens

        # Estimate cost
        input_cost = input_tokens * self.pricing[self.ai_model]["input"]
        output_cost = output_tokens * self.pricing[self.ai_model]["output"]
        self.total_cost += input_cost + output_cost

        # The response is untrusted text: parse it as data, never evaluate it as code
        mapping = self._parse_ai_mapping(output_text)
        if mapping is None:
            logger.warning("Could not parse the AI response as a dictionary - falling back to 'unknown'")
            mapping = {val: "unknown" for val in unmapped_values}  # Fallback

        # Memoize the AI response
        self.ai_cache[unmapped_tuple] = mapping
        # We store the raw AI response for debugging
        logger.debug("AI Response: %s", mapping)
        self.ai_response = output_text

        return mapping

    def standardize_list(self, values_to_remap):
        """
        Standardizes a list of values and returns a dictionary {original_value: standardized_value}.

        Results accumulate in remap_dict, so the returned dictionary also contains the mappings
        produced by earlier calls on the same instance.

        :param values_to_remap: List of raw values to standardize.
        :return: Dictionary {original_value: standardized_value}.
        """
        # A remapper built without a predefined mapping simply skips the rule-based step
        standard_map = self.standard_map if self.standard_map is not None else {}

        unmapped_values = []
        for value in set(values_to_remap):  # Process only unique values
            if pd.isna(value):  # Handle NaN values
                self.remap_dict[value] = "unknown"
                continue

            cleaned_value = self.clean_string(value)
            # Only strings can be lower-cased; other types skip that lookup
            lowered_value = value.lower() if isinstance(value, str) else None

            # Rule-Based Check (Predefined Mapping): cleaned form first, then raw, then lower-cased
            mapped = False
            for candidate in (cleaned_value, value, lowered_value):
                if candidate is not None and candidate in standard_map:
                    self.remap_dict[value] = standard_map[candidate]
                    mapped = True
                    break
            if mapped:
                continue

            # Exact Match in Standard Values
            if cleaned_value in self.standard_values:
                self.remap_dict[value] = cleaned_value
                continue

            # Fuzzy Matching
            fuzzy_match = self.fuzzy_match(cleaned_value)
            if fuzzy_match:
                self.remap_dict[value] = fuzzy_match
                continue

            # Capture anything that wasn't mapped
            unmapped_values.append(value)

        # AI Model - remap anything unmapped (batch request)
        ai_mapping = self.ai_standardize(unmapped_values)
        self.remap_dict.update(ai_mapping)

        return self.remap_dict

    def report_usage(self):
        """Prints a summary of token usage and cost."""
        print(f"\n🔹 Total Tokens Used: {self.total_tokens_used}")
        print(f"💰 Estimated Cost: ${self.total_cost:.4f}")
|
||
|
||
|
||
class AssetList:
|
||
"""
|
||
This class is used to standardise asset lists so that we can process the core information in a consistent manner.
|
||
"""
|
||
|
||
# Maps the field names used by the EPC API onto the column names used in the asset list
EPC_API_DATA_NAMES = {
    "uprn": "epc_os_uprn",
    "address1": "epc_address1",
    "address": "epc_address",
    "postcode": "epc_postcode",
    "inspection-date": "epc_inspection_date",
    "current-energy-efficiency": "epc_sap_score_on_register",
    "current-energy-rating": "epc_rating_on_register",
    "property-type": "epc_property_type",
    "built-form": "epc_archetype",
    "total-floor-area": "epc_total_floor_area",
    "construction-age-band": "epc_age_band",
    "floor-height": "epc_floor_height",
    "number-habitable-rooms": "epc_number_habitable_rooms",
    "walls-description": "epc_wall_construction",
    "roof-description": "epc_roof_construction",
    "floor-description": "epc_floor_construction",
    "mainheat-description": "epc_heating_type",
    "mainheatcont-description": "epc_heating_controls",
    "secondheat-description": "epc_secondary_heating",
    "transaction-type": "epc_reason",
    "energy-consumption-current": "epc_heat_demand",
    "photo-supply": "epc_photo_supply",
}

# Maps a second set of EPC field labels onto asset list column names.
# NOTE(review): the values are runtime column names, so the spelling "estiamted" is kept as-is.
FIND_EPC_DATA_NAMES = {
    "heating_text": "epc_estiamted_heating_kwh",
    "hot_water_text": "epc_estimated_hotwater_kwh",
    "Assessor’s name": "epc_assessor_name",
    "Assessor's Telephone": "epc_assessor_telephone",
    "Assessor's Email": "epc_assessor_email",
    "Accreditation scheme": "epc_assessor_accreditation",
    "Assessor’s ID": "epc_assessor_id",
    "Solar photovoltaics": "epc_solar_pv",
}

# Text values found in year-built columns, with the datetime each one is replaced by
DATETIME_REMAP = {
    "Pre 1900": datetime(1899, 12, 31),
}

# The accepted methods for deriving the address1 column
ADDRESS_1_CLEANING_METHODS = [
    # Takes the first two words of the full address, where the separator is a space
    "first_two_words",
    # Takes the first word of the full address, where the separator is a space
    "first_word",
    # Uses the NLP model in SearchEPC to extract the house number
    "house_number_extraction",
    # "address1_extraction"  # This method will use the NLP model to extract address1
]

# Standard column names
STANDARD_ADDRESS_1 = "domna_address_1"
STANDARD_POSTCODE = "domna_postcode"
STANDARD_FULL_ADDRESS = "domna_full_address"
STANDARD_YEAR_BUILT = "landlord_year_built"
STANDARD_UPRN = "ordnance_survey_uprn"
STANDARD_LANDLORD_PROPERTY_ID = "landlord_property_id"
STANDARD_PROPERTY_TYPE = "landlord_property_type"
STANDARD_WALL_CONSTRUCTION = "landlord_wall_construction"
STANDARD_HEATING_SYSTEM = "landlord_heating_system"
STANDARD_EXISTING_PV = "landlord_existing_pv"

DOMNA_PROPERTY_ID = "domna_property_id"

# Matches "x-y" tokens, used to spot addresses that might point to multiple units
MULTI_UNIT_REGEX = re.compile(r'\b([A-Za-z0-9]+)-([A-Za-z0-9]+)\b')

# Columns relating to the non-intrusive data
NON_INTRUSIVES_COLNAMES = [
    "Archetype",
    "Construction",
    "Insulated",
    "Material",
    "CIGA Check Required",
    "PV, ACCESS ISSUE, SEE NOTES",
    "OFF GAS - ROOF ORIENTATION",
    "Any further surveyor notes",
    "Surveyors Name",
]

# This SAP threshold is a key search criterion for properties that may be eligible for extraction
FILLED_CAVITY_SAP_THRESHOLD = 75
# SAP threshold for empty cavities.
# NOTE(review): the original comment here was cut short - confirm how this threshold is used.
EMPTY_CAVITY_SAP_THRESHOLD = 71
# Any EPC conducted prior to this year is deemed to be unreliable
EPC_YEAR_THRESHOLD = pd.Timestamp.now().year - 5

# Attributes - columns that we produce, calculated from other pieces of data
ATTRIBUTE_HAS_SOLAR = "attribute_has_solar"
ATTRIBUTE_NUMBER_OF_FLOORS = "attribute_est_number_floors"
ATTRIBUTE_ESTIMATED_PERIMETER = "attribute_est_perimter"
ATTRIBUTE_HEAT_LOSS_AREA = "attribute_heat_loss_area"
ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS = "attribute_epc_roof_insulation_thickness"
ATTRIBUTE_SAP_THRESHOLD_AND_BELOW = "sap_rating_{}_and_below".format(FILLED_CAVITY_SAP_THRESHOLD)
ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD = "EPC is pre {}".format(EPC_YEAR_THRESHOLD)

# EPC wall descriptions that are indicative of no insulation
EPC_NO_WALL_INSULATION_DESCRIPTIONS = [
    "cavity wall, as built, no insulation (assumed)",
    "cavity wall, as built, partial insulation (assumed)",
    "cavity wall, as built, partial insulation",
    "cavity wall, as built, no insulation",
]

# Substrings of an EPC wall description which indicate that the wall is insulated
EPC_INSULATED_WALLS_SUBSTRINGS = [
    ", insulated",
    "with external insulation",
    "with internal insulation",
    "filled cavity",
]

# Substrings of an EPC roof description which indicate that the roof is insulated
EPC_INSULATED_ROOF_SUBSTRINGS = [
    "(another dwelling above)",
    ", insulated",
    ", insulated (assumed) ",
    ", ceiling insulated",
]
|
||
|
||
def __init__(
|
||
self,
|
||
local_filepath,
|
||
sheet_name,
|
||
address1_colname,
|
||
postcode_colname,
|
||
full_address_colname,
|
||
landlord_property_id=None,
|
||
full_address_cols_to_concat=None,
|
||
missing_postcodes_method=None,
|
||
address1_extraction_method=None,
|
||
landlord_year_built=None,
|
||
landlord_uprn=None,
|
||
landlord_property_type=None,
|
||
landlord_wall_construction=None,
|
||
landlord_heating_system=None,
|
||
landlord_existing_pv=None,
|
||
header=0
|
||
):
|
||
self.local_filepath = local_filepath
|
||
self.sheet_name = sheet_name
|
||
# Read in the data
|
||
self.raw_asset_list = pd.read_excel(local_filepath, header=header, sheet_name=sheet_name)
|
||
self.standardised_asset_list = self.raw_asset_list.copy()
|
||
|
||
# We detect the presence of the non-intrusive columns
|
||
self.non_intrusives_present = True if "CIGA Check Required" in self.raw_asset_list.columns else False
|
||
|
||
# Names of columns
|
||
self.landlord_property_id = landlord_property_id
|
||
self.address1_colname = address1_colname
|
||
self.postcode_colname = postcode_colname
|
||
self.full_address_colname = full_address_colname
|
||
self.landlord_year_built = landlord_year_built
|
||
self.landlord_uprn = landlord_uprn
|
||
self.landlord_property_type = landlord_property_type
|
||
self.landlord_wall_construction = landlord_wall_construction
|
||
self.landlord_heating_system = landlord_heating_system
|
||
self.landlord_existing_pv = landlord_existing_pv
|
||
|
||
# parameters for cleaning
|
||
self.full_address_cols_to_concat = full_address_cols_to_concat
|
||
self.missing_postcodes_method = missing_postcodes_method
|
||
self.address1_extraction_method = address1_extraction_method
|
||
|
||
self.debug_information = {
|
||
"property_type": None,
|
||
"wall_construction": None,
|
||
"heating_system": None,
|
||
"existing_pv": None
|
||
}
|
||
|
||
self.variable_mappings = {}
|
||
|
||
self.rename_map = {}
|
||
self.keep_variables = []
|
||
|
||
# Finally, we handle the case where the landlord's property ID is actually the OS UPRN
|
||
if self.landlord_uprn == self.landlord_property_id:
|
||
self.standardised_asset_list[self.STANDARD_UPRN] = self.standardised_asset_list[self.landlord_uprn].copy()
|
||
# Update the reference to landlord UPRn
|
||
self.landlord_uprn = self.STANDARD_UPRN
|
||
|
||
def _extract_address1(self, asset_list, full_address_col, postcode_col, method="first_two_words"):
|
||
|
||
if method not in self.ADDRESS_1_CLEANING_METHODS:
|
||
raise ValueError(f"Method {method} for producing address1 not recognized")
|
||
|
||
if method == "first_two_words":
|
||
asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[:2].str.join(" ")
|
||
return asset_list
|
||
|
||
if method == "first_word":
|
||
asset_list[self.address1_colname] = asset_list[full_address_col].str.split(" ").str[0]
|
||
return asset_list
|
||
|
||
if method == "house_number_extraction":
|
||
asset_list[self.address1_colname] = asset_list.apply(
|
||
lambda x: SearchEpc.get_house_number(address=x[full_address_col], postcode=x[postcode_col]),
|
||
axis=1
|
||
)
|
||
return asset_list
|
||
|
||
raise ValueError(f"Method {method} not recognized")
|
||
|
||
@staticmethod
|
||
def _address1_extraction(x):
|
||
pass
|
||
|
||
def create_property_id(self):
|
||
"""
|
||
This function creates the domna property ID, which is simply a hash of the full address and postcode
|
||
We want all figures to be positive
|
||
:return:
|
||
"""
|
||
|
||
# We'll remove punctuation and whitespace from the address, before hashing to produce an ID
|
||
|
||
def _make_hash(value):
|
||
"""Generates a stable SHA256 hash suffix and appends it to a cleaned version of the value."""
|
||
# Normalize and remove special characters for cleaner ID
|
||
cleaned_value = re.sub(r"[^\w\s-]", "", value).replace(" ", "_").lower()
|
||
|
||
# Generate SHA-256 hash and truncate it
|
||
short_hash = hashlib.sha256(value.encode()).hexdigest()[:12]
|
||
|
||
return f"{cleaned_value}-{short_hash}"
|
||
|
||
# Apply transformation
|
||
self.standardised_asset_list[self.DOMNA_PROPERTY_ID] = (
|
||
self.standardised_asset_list[self.full_address_colname] +
|
||
self.standardised_asset_list[self.postcode_colname]
|
||
).str.strip().str.replace(r"[^\w\s]", "", regex=True).str.replace(" ", "").str.lower().apply(_make_hash)
|
||
|
||
@staticmethod
|
||
def _strip_postcode_from_full_address(full_address, postcode):
|
||
cleaned = full_address.replace(postcode, "")
|
||
# Remove any trailing commas and spaces
|
||
cleaned = cleaned.rstrip(", ").strip(",").strip()
|
||
return cleaned
|
||
|
||
@classmethod
|
||
def _identify_multi_address(cls, address):
|
||
# We check if the address is comma separated
|
||
if "," in address:
|
||
address1_section = address.split(",")[0]
|
||
# We look for string in the form (x-y)
|
||
return bool(cls.MULTI_UNIT_REGEX.search(address1_section))
|
||
|
||
@staticmethod
|
||
def _convert_uprn(x):
|
||
"""
|
||
Used to convert UPRNS to integer strings
|
||
:param x: uprn to convert
|
||
:return: converted uprn
|
||
"""
|
||
|
||
if pd.isnull(x):
|
||
return x
|
||
|
||
# check if numeric
|
||
if np.isreal(x):
|
||
return str(int(x))
|
||
|
||
if str(x).isdigit():
|
||
return str(int(x))
|
||
return x
|
||
|
||
def init_standardise(self):
|
||
"""
|
||
This function is used to standardise the asset list
|
||
:return: standardised asset list
|
||
"""
|
||
|
||
# Remove rows without a postcode
|
||
if self.postcode_colname is not None:
|
||
self.standardised_asset_list = self.standardised_asset_list.dropna(subset=[self.postcode_colname])
|
||
|
||
# We clean up portential non-breaking spaces, and double spaces
|
||
for col in [
|
||
c for c in [self.postcode_colname, self.full_address_colname, self.address1_colname] if
|
||
c is not None
|
||
]:
|
||
self.standardised_asset_list[col] = self.standardised_asset_list[col].astype(str)
|
||
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace('\xa0', ' ', regex=False)
|
||
self.standardised_asset_list[col] = self.standardised_asset_list[col].str.replace(' ', ' ', regex=False)
|
||
|
||
if self.address1_colname is None:
|
||
if self.address1_extraction_method is None:
|
||
raise ValueError("Missing address 1 - please specify an extraction method")
|
||
self.address1_colname = self.STANDARD_ADDRESS_1
|
||
# If we do not have this, we produce it
|
||
self.standardised_asset_list = self._extract_address1(
|
||
asset_list=self.standardised_asset_list,
|
||
full_address_col=self.full_address_colname,
|
||
postcode_col=self.postcode_colname,
|
||
method=self.address1_extraction_method
|
||
)
|
||
|
||
if self.full_address_colname is None:
|
||
if not self.full_address_cols_to_concat:
|
||
raise ValueError("Missing full address - please specify columns to concatenate")
|
||
self.full_address_colname = self.STANDARD_FULL_ADDRESS
|
||
self.standardised_asset_list[self.full_address_colname] = (
|
||
self.standardised_asset_list[self.full_address_cols_to_concat].apply(lambda x: ", ".join(x), axis=1)
|
||
)
|
||
else:
|
||
|
||
# Make sure to strip the postcode out of the full address
|
||
self.standardised_asset_list[self.full_address_colname] = self.standardised_asset_list.apply(
|
||
lambda x: self._strip_postcode_from_full_address(
|
||
full_address=x[self.full_address_colname],
|
||
postcode=x[self.postcode_colname]
|
||
),
|
||
axis=1
|
||
)
|
||
|
||
# We create the domna property id
|
||
self.create_property_id()
|
||
|
||
# Clean up the UPRN column, if the landlord has provided them
|
||
if self.landlord_uprn is not None:
|
||
self.standardised_asset_list[self.landlord_uprn] = (
|
||
self.standardised_asset_list[self.landlord_uprn].apply(self._convert_uprn)
|
||
)
|
||
|
||
# We keep just the columns we care about and will work through the various columns and standardise
|
||
variables = [
|
||
self.landlord_property_id,
|
||
self.DOMNA_PROPERTY_ID,
|
||
self.address1_colname,
|
||
self.postcode_colname,
|
||
self.full_address_colname,
|
||
self.landlord_uprn,
|
||
self.landlord_property_type,
|
||
self.landlord_year_built,
|
||
self.landlord_wall_construction,
|
||
self.landlord_heating_system,
|
||
self.landlord_existing_pv
|
||
]
|
||
# Keep just non-null variables (e.g landlord may not provide uprn
|
||
self.keep_variables = [v for v in variables if v is not None]
|
||
self.rename_map = {
|
||
self.landlord_property_id: self.STANDARD_LANDLORD_PROPERTY_ID,
|
||
self.address1_colname: self.STANDARD_ADDRESS_1,
|
||
self.postcode_colname: self.STANDARD_POSTCODE,
|
||
self.full_address_colname: self.STANDARD_FULL_ADDRESS,
|
||
self.landlord_uprn: self.STANDARD_UPRN,
|
||
self.landlord_property_type: self.STANDARD_PROPERTY_TYPE,
|
||
self.landlord_year_built: self.STANDARD_YEAR_BUILT,
|
||
self.landlord_wall_construction: self.STANDARD_WALL_CONSTRUCTION,
|
||
self.landlord_heating_system: self.STANDARD_HEATING_SYSTEM,
|
||
self.landlord_existing_pv: self.STANDARD_EXISTING_PV
|
||
}
|
||
self.rename_map = {k: v for k, v in self.rename_map.items() if k is not None}
|
||
|
||
if self.non_intrusives_present:
|
||
self.keep_variables += self.NON_INTRUSIVES_COLNAMES
|
||
self.rename_map = {
|
||
**self.rename_map,
|
||
**dict(
|
||
zip(self.NON_INTRUSIVES_COLNAMES, ["non-intrusives: " + c for c in self.NON_INTRUSIVES_COLNAMES])
|
||
)
|
||
}
|
||
|
||
# We idenfiy addresses which are likely to be multi-addresses (i.g are rooms x-y)
|
||
self.standardised_asset_list["is_multi_address"] = self.standardised_asset_list[
|
||
self.full_address_colname
|
||
].apply(lambda x: self._identify_multi_address(x))
|
||
|
||
# We handle cleaning for walls, in the instance that the landlord provides us with EPC data and
|
||
# we see instances of "average thermal transmittance" in the description
|
||
self.standardised_asset_list[self.landlord_wall_construction] = np.where(
|
||
self.standardised_asset_list[self.landlord_wall_construction].str.lower().str.contains(
|
||
"average thermal transmittance"
|
||
),
|
||
"new build - average thermal transmittance",
|
||
self.standardised_asset_list[self.landlord_wall_construction]
|
||
)
|
||
|
||
# Clear our build year column
|
||
# We attempt to process the year built column
|
||
if self.landlord_year_built is not None:
|
||
# We check if we have a datetime - year built has not been renamed
|
||
if isinstance(self.standardised_asset_list[self.landlord_year_built].iloc[0], datetime):
|
||
# We treat any string columns - with common values we see
|
||
self.standardised_asset_list[self.landlord_year_built] = (
|
||
self.standardised_asset_list[self.landlord_year_built].replace(self.DATETIME_REMAP)
|
||
)
|
||
|
||
self.standardised_asset_list[self.landlord_year_built] = pd.to_datetime(
|
||
self.standardised_asset_list[self.landlord_year_built]
|
||
)
|
||
# Convert this to year
|
||
self.standardised_asset_list[self.landlord_year_built] = (
|
||
self.standardised_asset_list[self.landlord_year_built].dt.year
|
||
)
|
||
else:
|
||
raise NotImplementedError("Year built column must be a datetime - implement me")
|
||
|
||
# We now create standard lookups
|
||
to_remap = {
|
||
self.landlord_property_type: {
|
||
"standard_values": property_type_mappings.STANDARD_PROPERTY_TYPES,
|
||
"standard_map": property_type_mappings.PROPERTY_MAPPING
|
||
},
|
||
self.landlord_wall_construction: {
|
||
"standard_values": walls_mappings.STANDARD_WALL_CONSTRUCTIONS,
|
||
"standard_map": walls_mappings.WALL_CONSTRUCTION_MAPPINGS
|
||
},
|
||
self.landlord_heating_system: {
|
||
"standard_values": heating_mappings.STANDARD_HEATING_SYSTEMS,
|
||
"standard_map": heating_mappings.HEATING_MAPPINGS
|
||
},
|
||
self.landlord_existing_pv: {
|
||
"standard_values": existing_pv_mappings.STANDARD_EXISTING_PV,
|
||
"standard_map": existing_pv_mappings.EXISTING_PV_MAPPINGS
|
||
}
|
||
}
|
||
|
||
for variable, config in to_remap.items():
|
||
logger.info("Standardising variable: %s", variable)
|
||
values_to_remap = self.standardised_asset_list[variable].unique()
|
||
# We want to map this to our standardised list of property types we're interested in
|
||
remapper = DataRemapper(standard_values=config["standard_values"], standard_map=config["standard_map"])
|
||
remap_dictionary = remapper.standardize_list(values_to_remap=values_to_remap.tolist())
|
||
self.variable_mappings[variable] = remap_dictionary
|
||
|
||
# We now print out the variable mappings, which can be reviewed by the user, before the final standardised
|
||
# asset list is returned
|
||
for variable, mapping in self.variable_mappings.items():
|
||
pprint(f"Variable: {variable}")
|
||
pprint(mapping)
|
||
# Print a space
|
||
print("\n")
|
||
pprint("=======================================")
|
||
|
||
def apply_standardiation(self, override_empty_mappings=False):
|
||
"""
|
||
This function applies the standardisation to the asset list
|
||
:param override_empty_mappings: If true, will override the check for empty mappings. This is only relevant
|
||
if there are no categories which need remapping which is highly unlikely
|
||
:return:
|
||
"""
|
||
if not self.variable_mappings and not override_empty_mappings:
|
||
raise ValueError("Please run init_standardise first")
|
||
|
||
logger.info("Applying standardisation to asset list")
|
||
|
||
for variable, mapping in self.variable_mappings.items():
|
||
self.standardised_asset_list[variable] = self.standardised_asset_list[variable].map(mapping)
|
||
|
||
if self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum():
|
||
# Drop the dupes
|
||
pprint(
|
||
f"There are {self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated().sum()} duplicated "
|
||
f"addresses - dropping"
|
||
)
|
||
self.standardised_asset_list = self.standardised_asset_list[
|
||
~self.standardised_asset_list[self.DOMNA_PROPERTY_ID].duplicated()
|
||
]
|
||
|
||
# Apply renames to our standard names
|
||
# Perform final variable selection and renaming:
|
||
self.standardised_asset_list = self.standardised_asset_list[self.keep_variables].rename(
|
||
columns=self.rename_map
|
||
)
|
||
|
||
def merge_data(self, df: pd.DataFrame):
|
||
"""
|
||
Used to insert data into the standardised asset list, based on the domna property id
|
||
:return:
|
||
"""
|
||
if self.DOMNA_PROPERTY_ID not in df.columns:
|
||
raise ValueError(f"Dataframe must contain the column {self.DOMNA_PROPERTY_ID}")
|
||
|
||
if df[self.DOMNA_PROPERTY_ID].duplicated().sum():
|
||
raise ValueError(f"{self.DOMNA_PROPERTY_ID} contains duplicated IDs")
|
||
|
||
self.standardised_asset_list = self.standardised_asset_list.merge(
|
||
df, how="left", on=self.DOMNA_PROPERTY_ID
|
||
)
|
||
|
||
def extract_attributes(self):
    """
    Derive the attribute columns that we use to identify viable work.

    Adds the solar flag, the estimated number of floors, perimeter and heat loss area, the EPC roof
    insulation thickness, the SAP threshold flag and the "EPC is old" flag, then processes the age
    band. The EPC floor area and habitable room columns are converted to floats along the way.

    :return: None. The columns are added to standardised_asset_list.
    """
    assets = self.standardised_asset_list
    epc = self.EPC_API_DATA_NAMES

    assets[self.ATTRIBUTE_HAS_SOLAR] = (
        assets[self.FIND_EPC_DATA_NAMES["Solar photovoltaics"]] |
        ~assets[epc["photo-supply"]].isin(["0.0", 0, None, ""])
    )

    accepted_epc_property_types = ["House", "Flat", "Bungalow", "Maisonette"]

    def _resolve_property_type(row):
        """Pick the landlord's property type when it is an accepted one, otherwise the EPC's."""
        # The landlord column can be absent or hold a missing value, so only strings are considered
        landlord_type = row.get(self.STANDARD_PROPERTY_TYPE)
        if isinstance(landlord_type, str) and landlord_type.title() in accepted_epc_property_types:
            return landlord_type.title()
        epc_type = row[epc["property-type"]]
        return None if pd.isnull(epc_type) else epc_type

    def _resolve_floor_height(row):
        """Return the EPC floor height as a float, defaulting to 2.5 when there is none."""
        raw_height = row[epc["floor-height"]]
        # NaN is truthy, so missing values need their own check alongside "" / 0 / None
        if pd.isnull(raw_height) or not raw_height:
            return 2.5
        return float(raw_height)

    # The logic here is:
    # 1) Take the property type provided by the HA themselves
    # 2) In absence of that, take the EPC property type
    # 3) Otherwise use None
    assets[self.ATTRIBUTE_NUMBER_OF_FLOORS] = assets.apply(
        lambda x: estimate_number_of_floors(property_type=_resolve_property_type(x)),
        axis=1
    )

    assets[epc["total-floor-area"]] = assets[epc["total-floor-area"]].astype(float)
    # Replace "" with a missing value before converting. NaN is used rather than None because
    # older pandas versions treat value=None as "no value given".
    assets[epc["number-habitable-rooms"]] = (
        assets[epc["number-habitable-rooms"]].replace("", np.nan).astype(float)
    )

    # Estimate the perimeter, from the per-floor area and room count
    assets[self.ATTRIBUTE_ESTIMATED_PERIMETER] = assets.apply(
        lambda x: estimate_perimeter(
            floor_area=x[epc["total-floor-area"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
            num_rooms=x[epc["number-habitable-rooms"]] / x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
        ), axis=1
    )

    assets[self.ATTRIBUTE_HEAT_LOSS_AREA] = assets.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x[self.ATTRIBUTE_NUMBER_OF_FLOORS],
            floor_height=_resolve_floor_height(x),
            perimeter=x[self.ATTRIBUTE_ESTIMATED_PERIMETER],
            built_form=x[epc["built-form"]]
        ),
        axis=1
    )

    assets[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS] = assets.apply(
        lambda x: RoofAttributes(description=x[epc["roof-description"]]).process()[
            "insulation_thickness"] if not pd.isnull(x[epc["roof-description"]]) else None,
        axis=1
    )

    # We produce some additional fields
    # 1) Is the SAP rating at or below the filled cavity threshold
    assets[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW] = (
        assets[epc["current-energy-efficiency"]] <= self.FILLED_CAVITY_SAP_THRESHOLD
    )
    # 2) Flag anything where the EPC was carried out before the threshold year
    assets[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD] = (
        pd.to_datetime(assets[epc["inspection-date"]]).dt.year < self.EPC_YEAR_THRESHOLD
    )

    self.process_age_band()
|
||
|
||
def process_age_band(self):
    """
    Compare each property's EPC construction age band with the landlord's year built.

    For every row of ``self.standardised_asset_list`` the EPC "construction-age-band" value is
    turned into a lower and an upper year bound and compared with ``self.STANDARD_YEAR_BUILT``.
    Three columns are then merged back onto ``self.standardised_asset_list`` with a left join:

    - ``epc_year_lower_bound``: first year covered by the band (None if open-ended or missing)
    - ``epc_year_upper_bound``: last year covered by the band (None if open-ended or missing)
    - ``Does Age Match EPC Age Band?``: text verdict of the comparison

    Band formats handled: missing / anomaly values, "England and Wales: 2007 onwards",
    "England and Wales: 2012 onwards", "England and Wales: before 1900", a bare year such as
    "1985", and "<prefix>: <lower>-<upper>". Any other format raises, as it does not fit the
    "<prefix>: <lower>-<upper>" pattern.

    :return: None. ``self.standardised_asset_list`` is replaced by the merged DataFrame.
    """
    age_band_column = self.EPC_API_DATA_NAMES["construction-age-band"]

    # Bands with no upper limit, mapped to the first year they cover
    open_ended_bands = {
        "England and Wales: 2007 onwards": 2007,
        "England and Wales: 2012 onwards": 2012,
    }

    no_year_built = "No Year Built From Landlord"
    band_matches = "EPC Age Band Matches Year Built"
    band_is_older = "EPC Age Band is older than Year Built"
    band_is_newer = "EPC Age Band is newer than Year Built"

    processed_age_band = []
    for _, x in self.standardised_asset_list.iterrows():
        age_band = x[age_band_column]
        year_built = x[self.STANDARD_YEAR_BUILT]
        year_built_missing = pd.isnull(year_built)

        if pd.isnull(age_band) or age_band in Definitions.DATA_ANOMALY_MATCHES:
            # Nothing usable on the EPC, so there is nothing to compare against
            lower_bound, upper_bound = None, None
            age_band_matches = "No EPC Age Band"

        elif age_band in open_ended_bands:
            lower_bound, upper_bound = open_ended_bands[age_band], None
            if year_built_missing:
                age_band_matches = no_year_built
            elif year_built >= lower_bound:
                age_band_matches = band_matches
            else:
                # The band starts after the year built, so the EPC band is the newer of the two
                age_band_matches = band_is_newer

        elif age_band == "England and Wales: before 1900":
            lower_bound, upper_bound = None, 1899
            if year_built_missing:
                age_band_matches = no_year_built
            elif year_built < 1900:
                age_band_matches = band_matches
            else:
                # The band ends before the year built, so the EPC band is the older of the two
                age_band_matches = band_is_older

        elif age_band.isdigit():
            # The EPC holds a single construction year rather than a range
            lower_bound = upper_bound = int(age_band)
            if year_built_missing:
                age_band_matches = no_year_built
            elif year_built == lower_bound:
                age_band_matches = band_matches
            else:
                age_band_matches = "EPC Age Band is different from Year Built"

        else:
            # Otherwise, we extract the upper and lower bounds from "<prefix>: <lower>-<upper>"
            lower_date, upper_date = age_band.split(": ")[1].split("-")
            lower_bound, upper_bound = int(lower_date), int(upper_date)
            if year_built_missing:
                # Comparisons against NaN are always False, so without this guard a missing
                # year built would be reported as a mismatch
                age_band_matches = no_year_built
            elif year_built > upper_bound:
                age_band_matches = band_is_older
            elif year_built < lower_bound:
                age_band_matches = band_is_newer
            else:
                age_band_matches = band_matches

        processed_age_band.append(
            {
                self.DOMNA_PROPERTY_ID: x[self.DOMNA_PROPERTY_ID],
                "epc_year_lower_bound": lower_bound,
                "epc_year_upper_bound": upper_bound,
                "Does Age Match EPC Age Band?": age_band_matches,
            }
        )

    # No explicit key is given, so the join uses the columns the two frames share
    self.standardised_asset_list = self.standardised_asset_list.merge(
        pd.DataFrame(processed_age_band), how="left"
    )
|
||
|
||
def identify_worktypes(self, cleaned):
|
||
|
||
if not self.non_intrusives_present:
|
||
raise NotImplementedError("Need to implement the case for non-intrusives")
|
||
|
||
# If we have non-intrusives completed, we can use this to identify work types
|
||
|
||
if self.non_intrusives_present:
|
||
######################################################
|
||
# Empty cavity:
|
||
######################################################
|
||
# 1) Has been flagged on the non-intrusives as being a cavity wall, empty or partially filled
|
||
# 2) The age is before 1995
|
||
# TODO: 3) Remove anything that likely has access issues
|
||
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] = (
|
||
(~self.standardised_asset_list[self.STANDARD_PROPERTY_TYPE].isin(["bedsit"])) &
|
||
(self.standardised_asset_list['non-intrusives: Construction'] == "CAVITY") &
|
||
self.standardised_asset_list['non-intrusives: Insulated'].isin(["EMPTY", "PARTIAL"]) &
|
||
(self.standardised_asset_list[self.STANDARD_YEAR_BUILT] <= 2000)
|
||
)
|
||
|
||
self.standardised_asset_list["epc_indicates_empty_cavity"] = (
|
||
self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().isin(
|
||
self.EPC_NO_WALL_INSULATION_DESCRIPTIONS
|
||
) & (
|
||
self.standardised_asset_list["epc_year_upper_bound"] <= 1995
|
||
) & (
|
||
~self.standardised_asset_list[self.ATTRIBUTE_EPC_PRE_YEAR_THRESHOLD]
|
||
) & (
|
||
self.standardised_asset_list[
|
||
self.EPC_API_DATA_NAMES["current-energy-efficiency"]] <= self.EMPTY_CAVITY_SAP_THRESHOLD
|
||
)
|
||
)
|
||
|
||
self.standardised_asset_list["empty_cavity"] = (
|
||
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"] |
|
||
self.standardised_asset_list["epc_indicates_empty_cavity"]
|
||
)
|
||
# We add a reason
|
||
self.standardised_asset_list["empty_cavity_reason"] = np.where(
|
||
self.standardised_asset_list["non_intrusive_indicates_empty_cavity"],
|
||
"Non-Intrusive Data",
|
||
"EPC Data"
|
||
)
|
||
|
||
######################################################
|
||
# Extraction
|
||
######################################################
|
||
|
||
# TODO When filtering like this, 627 properties are flagged as not needing a CIGA check and 582 are flagged
|
||
# as needing a CIGA check. What is the logic we should be applying here?
|
||
self.standardised_asset_list["non_intrusive_indicates_cavity_extraction"] = (
|
||
(self.standardised_asset_list["non-intrusives: Construction"] == "CAVITY") &
|
||
(self.standardised_asset_list["non-intrusives: Insulated"].isin(["RETRO DRILLED", "FILLED AT BUILD"])) &
|
||
(~self.standardised_asset_list['non-intrusives: Material'].isin(["GREY LOOSE BEAD", "FORMALDEHYDE"])
|
||
) & (
|
||
self.standardised_asset_list[self.ATTRIBUTE_SAP_THRESHOLD_AND_BELOW]
|
||
)
|
||
)
|
||
|
||
######################################################
|
||
# Solar
|
||
######################################################
|
||
# Criteria:
|
||
|
||
# TODO: Standardise these columns with our cleaned_data object
|
||
|
||
# Check 1: Does the property have a valid heating system?
|
||
self.standardised_asset_list["solar_landlord_data_indicates_correct_heating_system"] = (
|
||
self.standardised_asset_list[self.STANDARD_HEATING_SYSTEM].isin(
|
||
["air source heat pump", "ground source heat pump", "high heat retention storage heaters"]
|
||
)
|
||
)
|
||
|
||
self.standardised_asset_list["solar_epc_data_indicates_correct_heating_system"] = (
|
||
(
|
||
self.standardised_asset_list[self.EPC_API_DATA_NAMES["mainheat-description"]]
|
||
.str.lower().str.contains("air source heat pump|ground source heat pump")
|
||
) | (
|
||
self.standardised_asset_list[
|
||
self.EPC_API_DATA_NAMES["mainheat-description"]].str.lower().str.contains(
|
||
"electric storage heaters"
|
||
) & (
|
||
self.standardised_asset_list[self.EPC_API_DATA_NAMES[
|
||
"mainheatcont-description"]] == "Controls for high heat retention storage heaters"
|
||
)
|
||
)
|
||
)
|
||
|
||
# Check 2: Does the property have solar already
|
||
self.standardised_asset_list["property_has_solar"] = (
|
||
(self.standardised_asset_list[self.STANDARD_EXISTING_PV] == "already has PV") |
|
||
(self.standardised_asset_list["non-intrusives: PV, ACCESS ISSUE, SEE NOTES"] == "SOLAR PV ON ROOF") |
|
||
(self.standardised_asset_list[self.ATTRIBUTE_HAS_SOLAR])
|
||
)
|
||
|
||
# Check 3: Does the property meet the fabric condition
|
||
# Solar PV installs are subject to the minimum insulation requirements which means:
|
||
# 1) one of the following insulation measures must be installed as part of the same
|
||
# ECO4 project:
|
||
# • roof insulation (flat roof, pitched roof, room-in-roof)
|
||
# • exterior facing wall insulation (cavity wall, solid wall)
|
||
# • party cavity wall insulation
|
||
# • floor insulation (solid and underfloor)
|
||
#
|
||
# OR
|
||
#
|
||
# all measures (except any exempted measure referred to in paragraph 4.28)
|
||
# listed in paragraph a) must already be installed
|
||
#
|
||
# With this in mind, we look for 2 classes
|
||
# 1) The property is fully insulated apart from the loft (<200mm insulation)
|
||
# 2) The property is fully insulated
|
||
|
||
self.standardised_asset_list["solar_landlord_walls_insulated"] = (
|
||
self.standardised_asset_list[self.STANDARD_WALL_CONSTRUCTION].isin(
|
||
["filled cavity", "insulated solid brick"]
|
||
)
|
||
)
|
||
|
||
# TODO: We don't have information about the roof from this landlord
|
||
self.standardised_asset_list["solar_epc_walls_insulated"] = (
|
||
self.standardised_asset_list[self.EPC_API_DATA_NAMES["walls-description"]].str.lower().str.contains(
|
||
"|".join(self.EPC_INSULATED_WALLS_SUBSTRINGS)
|
||
)
|
||
)
|
||
|
||
# We merge on the u-value for average thermal transmittance
|
||
roof_uvalue_data = pd.DataFrame(cleaned["roof-description"])
|
||
roof_uvalue_data = roof_uvalue_data[
|
||
~pd.isnull(roof_uvalue_data["thermal_transmittance"])
|
||
][["original_description", "thermal_transmittance"]].rename(
|
||
columns={
|
||
"original_description": self.EPC_API_DATA_NAMES["roof-description"],
|
||
"thermal_transmittance": "roof_u_value"
|
||
}
|
||
)
|
||
|
||
self.standardised_asset_list = self.standardised_asset_list.merge(
|
||
roof_uvalue_data, how="left", on=self.EPC_API_DATA_NAMES["roof-description"]
|
||
)
|
||
|
||
# If the u-value of a roof is less than 0.7 we consider it insulated
|
||
self.standardised_asset_list["solar_epc_roof_insulated"] = (
|
||
self.standardised_asset_list[self.EPC_API_DATA_NAMES["roof-description"]].str.lower().str.contains(
|
||
"|".join(self.EPC_INSULATED_ROOF_SUBSTRINGS), regex=False
|
||
) | (
|
||
self.standardised_asset_list[self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
|
||
lambda x: int(x) >= 270 if str(x).isdigit() else False
|
||
)
|
||
) | (
|
||
self.standardised_asset_list["roof_u_value"].apply(
|
||
lambda x: x <= 0.7 if not pd.isnull(x) else False
|
||
)
|
||
)
|
||
)
|
||
|
||
self.standardised_asset_list["solar_epc_loft_needs_topup"] = self.standardised_asset_list[
|
||
self.ATTRIBUTE_EPC_ROOF_INSULATION_THICKNESS].apply(
|
||
lambda x: int(x) < 270 if str(x).isdigit() else False
|
||
)
|
||
|
||
self.standardised_asset_list["solar_epc_floor_is_solid"] = self.standardised_asset_list[
|
||
self.EPC_API_DATA_NAMES["floor-description"]
|
||
].str.lower().str.contains("solid")
|
||
self.standardised_asset_list["solar_epc_floor_is_solid"] = (
|
||
self.standardised_asset_list["solar_epc_floor_is_solid"].fillna(False)
|
||
)
|
||
|
||
z = self.standardised_asset_list[
|
||
self.standardised_asset_list["solar_epc_floor_is_solid"] == True
|
||
]
|