Model/etl/route_march_data_pull/app.py

import os
import time
from BaseUtility import Definitions
import pandas as pd
import numpy as np
from tqdm import tqdm
from datetime import datetime

from dotenv import load_dotenv
from backend.SearchEpc import SearchEpc
from etl.find_my_epc.RetrieveFindMyEpc import RetrieveFindMyEpc
from etl.epc_clean.epc_attributes.RoofAttributes import RoofAttributes

from recommendations.recommendation_utils import (
    estimate_perimeter,
    estimate_external_wall_area,
    estimate_number_of_floors
)

from etl.epc_clean.epc_attributes.attribute_utils import (
    extract_thermal_transmittance
)

load_dotenv(dotenv_path="backend/.env")
EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")


def _retrieve_find_my_epc_data(newest_epc):
    """
    Fetch the newest FindMyEPC record for an EPC, falling back to {} when none is found.

    The lookup is first attempted with the EPC's "address". If that reports "No EPC found" and the
    EPC carries an "address1" field, the lookup is retried with "address1".

    :param newest_epc: EPC record with "address" and "postcode" keys and, optionally, "address1"
    :return: The result of retrieve_newest_find_my_epc_data(), or {} when a lookup raises ValueError
    :raises Exception: If the first lookup fails with anything other than a ValueError
    """
    try:
        find_epc_searcher = RetrieveFindMyEpc(
            address=newest_epc["address"], postcode=newest_epc["postcode"]
        )
        return find_epc_searcher.retrieve_newest_find_my_epc_data()
    except ValueError as e:
        if "No EPC found" not in str(e) or "address1" not in newest_epc:
            return {}
    except Exception as e:
        raise Exception(f"Error retrieving FindMyEPC data: {e}") from e

    # Retry with the first address line. Any ValueError here means we have no data; previously a
    # ValueError other than "No EPC found" left the result unassigned, so the row either reused the
    # previous home's data or failed with a NameError.
    try:
        find_epc_searcher = RetrieveFindMyEpc(
            address=newest_epc["address1"], postcode=newest_epc["postcode"]
        )
        return find_epc_searcher.retrieve_newest_find_my_epc_data()
    except ValueError:
        return {}


def get_data(
    asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, uprn_column=None,
    epc_api_only=False
):
    """
    Look up the newest EPC (plus recommendations and FindMyEPC data) for every row of asset_list.

    :param asset_list: DataFrame of properties; must contain a "row_id" column and the columns named
        by the other arguments
    :param fulladdress_column: Name of the column holding the full address string
    :param address1_column: Name of the column holding the first address line / house number
    :param postcode_column: Name of the column holding the postcode
    :param manual_uprn_map: Mapping of full address -> UPRN, consulted before the UPRN column
    :param uprn_column: Optional name of a column holding UPRNs
    :param epc_api_only: If True, only the EPC record is collected (no recommendations and no
        FindMyEPC lookup)
    :return: Tuple (epc_data, errors, no_epc): a list of dicts (one per property with an EPC), the
        row_ids whose processing raised, and the row_ids for which no EPC was found
    """
    epc_data = []
    errors = []
    no_epc = []
    for _, home in tqdm(asset_list.iterrows(), total=len(asset_list)):
        try:
            postcode = home[postcode_column]
            house_number = str(home[address1_column]).strip()
            full_address = home[fulladdress_column].strip()
            # Kept separately so the retry below can reuse the result instead of calling again
            extracted_house_no = SearchEpc.get_house_number(address=house_number, postcode=postcode)
            house_no = house_number if extracted_house_no is None else extracted_house_no

            uprn = manual_uprn_map.get(full_address)
            if uprn is None and uprn_column is not None and home.get(uprn_column):
                uprn = home[uprn_column]

            if pd.isnull(uprn):
                uprn = None

            searcher = SearchEpc(
                address1=str(house_no),
                postcode=postcode,
                auth_token=EPC_AUTH_TOKEN,
                os_api_key="",
                property_type=None,
                fast=True,
                full_address=full_address,
                max_retries=5,
                uprn=uprn
            )
            # Force the skipping of estimating the EPC
            searcher.ordnance_survey_client.property_type = None
            searcher.ordnance_survey_client.built_form = None

            searcher.find_property(skip_os=True)

            # Nothing found and no UPRN to rely on: try again with an alternative first address line
            if searcher.newest_epc is None and uprn is None:
                if extracted_house_no is None:
                    # Backup: use the second comma-separated part, else the first word
                    address_parts = full_address.split(",")
                    if len(address_parts) > 1:
                        add1 = address_parts[1].strip()
                    else:
                        add1 = full_address.split(" ")[0].strip()
                else:
                    add1 = house_number

                searcher = SearchEpc(
                    address1=add1,
                    postcode=postcode,
                    auth_token=EPC_AUTH_TOKEN,
                    os_api_key="",
                    property_type=None,
                    fast=True,
                    full_address=full_address,
                    max_retries=5
                )

                # Check if we have a flat or apartment
                if any(marker in house_number.lower() for marker in ("flat", "apartment", "apt")):
                    searcher.ordnance_survey_client.property_type = "Flat"

                searcher.find_property(skip_os=True)

            if searcher.newest_epc is None:
                no_epc.append(home["row_id"])
                continue

            if epc_api_only:
                epc_data.append({"row_id": home["row_id"], **searcher.newest_epc})
                continue

            # Look for EPC recommendations; a failed lookup is treated as "no recommendations".
            # `except Exception` (not a bare except) so Ctrl-C / SystemExit still stop the run.
            try:
                property_recommendations = searcher.client.domestic.recommendations(
                    searcher.newest_epc["lmk-key"]
                )
            except Exception:
                property_recommendations = {"rows": []}

            # Retrieve data from FindMyEPC
            find_epc_data = _retrieve_find_my_epc_data(searcher.newest_epc)
            # Random pause between FindMyEPC lookups
            time.sleep(np.random.uniform(0.1, 1))

            epc_data.append(
                {
                    "row_id": home["row_id"],
                    **searcher.newest_epc,
                    "recommendations": property_recommendations["rows"],
                    "find_my_epc_data": find_epc_data,
                }
            )
        except Exception:
            # Best effort: record the failing row so the caller can retry it, then back off
            errors.append(home["row_id"])
            time.sleep(5)

    return epc_data, errors, no_epc


def extract_address1(asset_list, full_address_col, postcode_col, method="first_two_words"):
    """
    Add an "address1_extracted" column to asset_list, derived from the full address.

    :param asset_list: DataFrame to annotate (modified in place and returned)
    :param full_address_col: Name of the column holding the full address
    :param postcode_col: Name of the postcode column (only used by "house_number_extraction")
    :param method: "first_two_words", "first_word" or "house_number_extraction"
    :return: The same DataFrame, with the new column added
    :raises ValueError: If method is not one of the supported names
    """
    target_col = "address1_extracted"

    if method in ("first_two_words", "first_word"):
        # Both word-based methods work on the space-separated tokens of the address
        tokens = asset_list[full_address_col].str.split(" ")
        if method == "first_two_words":
            asset_list[target_col] = tokens.str[:2].str.join(" ")
        else:
            asset_list[target_col] = tokens.str[0]
    elif method == "house_number_extraction":
        def _house_number(row):
            return SearchEpc.get_house_number(address=row[full_address_col], postcode=row[postcode_col])

        asset_list[target_col] = asset_list.apply(_house_number, axis=1)
    else:
        raise ValueError(f"Method {method} not recognized")

    return asset_list


def process_age_band(x, year_built_column):
    """
    Compare a property's recorded build year against its EPC "Property Age Band".

    :param x: Row (Series or dict-like) containing year_built_column and "Property Age Band"
    :param year_built_column: Name of the entry holding the build year (datetime, number or
        numeric string)
    :return: "No EPC Age Band" when the band is missing/anomalous or the build year is missing or
        cannot be parsed; otherwise "EPC Age Band Matches Year Built",
        "EPC Age Band is older than Year Built" or "EPC Age Band is newer than Year Built"
    """
    raw_year = x[year_built_column]
    if isinstance(raw_year, datetime):
        year_built = raw_year.year
    else:
        try:
            year_built = float(raw_year)
        except (TypeError, ValueError):
            # None or free text (e.g. "Pre 1900") cannot be compared: treat as a missing year
            # instead of crashing the whole DataFrame.apply
            year_built = None

    age_band = x["Property Age Band"]
    if pd.isnull(age_band) or pd.isnull(year_built) or age_band in Definitions.DATA_ANOMALY_MATCHES:
        return "No EPC Age Band"

    matches = "EPC Age Band Matches Year Built"
    band_is_older = "EPC Age Band is older than Year Built"
    band_is_newer = "EPC Age Band is newer than Year Built"

    # We check if we have a numeric age band (a single year)
    if age_band.isdigit():
        band_year = float(age_band)
        if year_built == band_year:
            return matches
        return band_is_older if year_built > band_year else band_is_newer

    # Handle the open-ended bands
    onwards_bands = {
        "England and Wales: 2007 onwards": 2007,
        "England and Wales: 2012 onwards": 2012,
    }
    if age_band in onwards_bands:
        return matches if year_built >= onwards_bands[age_band] else band_is_older

    if age_band == "England and Wales: before 1900":
        return matches if year_built < 1900 else band_is_newer

    # Remaining age bands are formatted as:
    # 'England and Wales: {lower date}-{upper date}'
    # so we extract the lower and upper date
    lower_text, upper_text = age_band.split(": ")[1].split("-")
    lower_date = float(lower_text)
    upper_date = float(upper_text)

    if lower_date <= year_built <= upper_date:
        return matches

    if year_built > upper_date:
        return band_is_older

    if year_built < lower_date:
        return band_is_newer

    # Only reachable for a malformed band whose lower date exceeds its upper date
    raise Exception("Should not reach here")


def app():
    """
    This app pulls EPC data for some properties owned by Livewest

    Data request contents:
    Date of last EPC
    Reason for EPC
    SAP score on register
    Property Type
    Property Area
    Property Age
    Any Dimensions (HLP,PW,RH)
    Property Wall Construction
    Heating Type
    Secondary Heating
    Loft Insulation Depth

    Additional if possible:
    Heat loss calculations
    EPC recommendations
    Property UPRN
    """

    # TODO:
    # For cavity work:
    # - Flag any entries that have a different wall type between non-intrusive data against EPC
    # - Worth double checking entries that have a difference in wall construction
    # - Look at anything that is flagged as an empty cavity but the EPC data says it’s a filled cavity
    # - Look at the current EPC scores - Anything that is C75 or above, especially if it’s assumed no insulation
    # - By postcode, we can try and deduce if all of the addresses are a flats and then estimate if 50% of the flats
    # are less than C75
    # - Flag anything pre SAP2012
    # - Flag anything over 5 years old
    # - Look at year built vs age band
    #
    # For Solar:
    # - Discount any that have solar PV - based on non-intrusives and from the inspections team
    # - In the heating, discount anything that isn’t ashp, ghsp, hhrs, electric storage - possibly homes with
    # electric room heaters but it might need to be an EPC E
    # - Fabric - check the floor, wall and roof:
    #     - Filled or empty cavity is good
    #     - Insulated solid/timber/system built is good
    #     - SCIS/CEG needs solid floors
    #     - JJC don’t care
    #     - Anything with a loft 200 or below
    # - Anything C75 and above won’t qualify
    # - Insulated loft = 200mm
    # - We want: fully insulated property (all wall types), EPC D or below (floors should be solid)
    # - Or the insulation required is loft/cavity (floors should be solid)

    # For Westward
    DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Westward"
    DATA_FILENAME = "WESTWARD - completed list..xlsx"
    SHEET_NAME = "Sheet1"
    POSTCODE_COLUMN = "WFT EDIT Postcode"
    FULLADDRESS_COLUMN = "Address"
    ADDRESS1_COLUMN = None
    ADDRESS1_METHOD = "house_number_extraction"
    ADDRESS_COLS_TO_CONCAT = []
    MISSING_POSTCODES_METHOD = None
    PROPERTY_YEAR_BUILT = "Build date"
    UPRN_COLUMN = "UPRN"
    # If we have the non-intrusives data, this should be true
    HAS_NON_INTRUSIVES = True

    # DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Colchester"
    # DATA_FILENAME = "Warmfront data- Colchester Borough Homes (Complete).xlsx"
    # SHEET_NAME = "Sheet1"
    # POSTCODE_COLUMN = 'Full Address.1'
    # FULLADDRESS_COLUMN = "Full Address"
    # ADDRESS1_COLUMN = None
    # ADDRESS1_METHOD = "first_word"
    # ADDRESS_COLS_TO_CONCAT = []
    # MISSING_POSTCODES_METHOD = None
    # PROPERTY_YEAR_BUILT = "Build Date"
    # UPRN_COLUMN = None
    # # If we have the non-intrusives data, this should be true
    # HAS_NON_INTRUSIVES = True

    # Maps addresses to uprn in problematic cases
    MANUAL_UPRN_MAP = {}

    asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME)

    if MISSING_POSTCODES_METHOD is not None:
        if MISSING_POSTCODES_METHOD == "last_two_words":
            # Replace any double spaces
            asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)
            asset_list["Postcode"] = np.where(
                pd.isnull(asset_list["Postcode"]),
                asset_list[FULLADDRESS_COLUMN].str.split(" ").str[-2:].str.join(" "),
                asset_list["Postcode"]
            )
        else:
            raise ValueError(f"Method {MISSING_POSTCODES_METHOD} not recognized")

    asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index()
    asset_list["row_id"] = asset_list.index

    # We clean up potential non-breaking spaces, and double spaces
    for col in [c for c in [POSTCODE_COLUMN, FULLADDRESS_COLUMN, ADDRESS1_COLUMN] if c is not None]:
        asset_list[col] = asset_list[col].astype(str)
        asset_list[col] = asset_list[col].str.replace('\xa0', ' ', regex=False)
        asset_list[col] = asset_list[col].str.replace('  ', ' ', regex=False)
        asset_list[col] = asset_list[col].str.strip()

    if ADDRESS1_COLUMN is None:
        ADDRESS1_COLUMN = "address1_extracted"
        asset_list = extract_address1(
            asset_list=asset_list,
            full_address_col=FULLADDRESS_COLUMN,
            postcode_col=POSTCODE_COLUMN,
            method=ADDRESS1_METHOD
        )

    if FULLADDRESS_COLUMN is None:
        FULLADDRESS_COLUMN = "fulladdress_extracted"
        # We concatenate the columns in ADDRESS_COLS_TO_CONCAT, on commas
        # Sometimes, some of the columns are empty, so we need to remove them
        asset_list[FULLADDRESS_COLUMN] = asset_list[ADDRESS_COLS_TO_CONCAT].apply(
            lambda x: ", ".join([y for y in x if not pd.isnull(y)]), axis=1
        )

        # We clean up potential non-breaking spaces, and double spaces
        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].astype(str)
        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('\xa0', ' ', regex=False)
        asset_list[FULLADDRESS_COLUMN] = asset_list[FULLADDRESS_COLUMN].str.replace('  ', ' ', regex=False)

    if UPRN_COLUMN is not None:
        # Check if it's numeric and if so, make sure it's an integer
        def convert_uprn(x):
            """
            Normalise a UPRN value to a string of digits where possible.

            Null values are returned as-is; numeric values and all-digit strings are converted via
            int() to a string (so 123.0 becomes "123"); anything else is returned unchanged.
            """
            if pd.isnull(x):
                return x

            # Either a real number or a string made up solely of digits
            looks_numeric = np.isreal(x) or str(x).isdigit()
            return str(int(x)) if looks_numeric else x

        asset_list[UPRN_COLUMN] = asset_list[UPRN_COLUMN].apply(convert_uprn)

    # We attempt to process the year built column
    if PROPERTY_YEAR_BUILT is not None:
        # We check if we have a datetime
        if isinstance(asset_list[PROPERTY_YEAR_BUILT].iloc[0], datetime):
            # We treat any string columns - with common values we see
            datetime_remap = {
                "Pre 1900": datetime(year=1899, month=12, day=31),
            }
            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].replace(datetime_remap)

            asset_list[PROPERTY_YEAR_BUILT] = pd.to_datetime(asset_list[PROPERTY_YEAR_BUILT])
            # Convert this to year
            asset_list[PROPERTY_YEAR_BUILT] = asset_list[PROPERTY_YEAR_BUILT].dt.year

    # We check for duplicated addresses
    asset_list["deduper"] = asset_list[FULLADDRESS_COLUMN] + asset_list[POSTCODE_COLUMN]
    if asset_list["deduper"].duplicated().sum():
        # Drop the dupes
        print(f"There are {asset_list['deduper'].duplicated().sum()} duplicated addresses - dropping")
        asset_list = asset_list[~asset_list["deduper"].duplicated()]
    asset_list = asset_list.drop(columns=["deduper"])

    # We chunk up this data into 5000 rows at a time
    # Create the chunks directory
    if not os.path.exists(os.path.join(DATA_FOLDER, "Chunks")):
        os.makedirs(os.path.join(DATA_FOLDER, "Chunks"))
    chunk_size = 5000
    errors = []
    no_epc = []
    skip = None  # Used to skip already completed chunks
    for i in range(0, len(asset_list), chunk_size):
        print(f"Processing chunk {i} to {i + chunk_size}")
        if skip is not None:
            if i <= skip:
                continue
        chunk = asset_list[i:i + chunk_size]
        epc_data_chunk, errors_chunk, no_epc_chunk = get_data(
            asset_list=chunk,
            fulladdress_column=FULLADDRESS_COLUMN,
            address1_column=ADDRESS1_COLUMN,
            postcode_column=POSTCODE_COLUMN,
            manual_uprn_map=MANUAL_UPRN_MAP,
            uprn_column=UPRN_COLUMN
        )

        # We now retrieve any failed properties
        chunk_failed = chunk[chunk["row_id"].isin(errors)]
        epc_data_failed, _, _ = get_data(
            asset_list=chunk_failed,
            fulladdress_column=FULLADDRESS_COLUMN,
            address1_column=ADDRESS1_COLUMN,
            postcode_column=POSTCODE_COLUMN,
            manual_uprn_map=MANUAL_UPRN_MAP,
            epc_api_only=False
        )

        epc_data_chunk.extend(epc_data_failed)
        errors.extend(errors_chunk)
        no_epc.extend(no_epc_chunk)

        # Append the failed data to the main data
        # Store the chunk locally as a csv
        pd.DataFrame(epc_data_chunk).to_csv(os.path.join(DATA_FOLDER, f"Chunks/Chunk {i}.csv"), index=False)

    # We read in and concatenate the created chunks
    chunks_folder = os.path.join(DATA_FOLDER, "Chunks")
    # List the contents
    chunk_files = os.listdir(chunks_folder)
    epc_data = []
    for file in chunk_files:
        csv_data = pd.read_csv(os.path.join(chunks_folder, file))
        # We need to convert the recommendations back to a list
        csv_data["recommendations"] = csv_data["recommendations"].apply(eval)
        csv_data["find_my_epc_data"] = csv_data["find_my_epc_data"].apply(eval)
        epc_data.append(csv_data)

    epc_df = pd.concat(epc_data)

    # We expand out the recommendations
    recommendations_df = epc_df[["row_id", "recommendations"]]

    unique_recommendations = set()
    for _, row in recommendations_df.iterrows():
        unique_recommendations.update([rec["improvement-summary-text"] for rec in row["recommendations"]])

    columns = ["row_id"] + list(unique_recommendations)
    transformed_data = []
    for _, row in recommendations_df.iterrows():
        # Initialize a dictionary for this row with False for all recommendations
        row_data = {col: False for col in columns}
        row_data["row_id"] = row["row_id"]

        # Set True for each recommendation present in this row
        for rec in row["recommendations"]:
            recommendation_text = rec["improvement-summary-text"]
            row_data[recommendation_text] = True

        # Append the row data to transformed_data
        transformed_data.append(row_data)

    transformed_df = pd.DataFrame(transformed_data)
    # At the moment, we're only using a limited set of columns - let's just keep cavity wall insulation
    # recommendations
    transformed_df = transformed_df[["row_id", "Cavity wall insulation"]]

    # Get the find my epc data
    find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(
        pd.json_normalize(epc_df["find_my_epc_data"])
    )
    # We check if we get the solar pv column:
    if "Solar photovoltaics" not in find_my_epc_data.columns:
        find_my_epc_data["Solar photovoltaics"] = False

    # Retrieve just the data we need
    epc_df = epc_df[
        [
            "row_id",
            "uprn",
            "address1",
            "address",
            "postcode",
            "property-type",
            "built-form",
            "inspection-date",
            "current-energy-rating",
            "current-energy-efficiency",
            "roof-description",
            "walls-description",
            "floor-description",
            "transaction-type",
            # New fields needed
            "secondheat-description",
            "total-floor-area",
            "construction-age-band",
            "floor-height",
            "number-habitable-rooms",
            "mainheat-description",
            #
            "energy-consumption-current",  # kwh/m2
            "photo-supply",
        ]
    ].rename(
        columns={"address1": "Address1 on EPC", "address": "Address on EPC", "postcode": "Postcode on EPC"}
    )

    asset_list = asset_list.merge(
        epc_df,
        how="left",
        on="row_id"
    ).merge(
        find_my_epc_data[
            [
                "row_id", "heating_text", "hot_water_text", 'Assessor’s name',
                "Assessor's Telephone", "Assessor's Email", "Accreditation scheme",
                "Assessor’s ID", "Solar photovoltaics"
            ]
        ].rename(
            columns={
                "Solar photovoltaics": "Has Solar PV",
                "heating_text": "Heating Estimated kWh",
                "hot_water_text": "Hot Water Estimated kWh",
            }
        ),
        how="left",
        on="row_id"
    )

    asset_list["Has Solar PV"] = asset_list["Has Solar PV"] | ~asset_list["photo-supply"].isin(["0.0", 0, None, ""])
    asset_list = asset_list.drop(columns=["photo-supply"])

    # Rename the columns
    asset_list = asset_list.rename(columns={
        "inspection-date": "Date of last EPC",
        "current-energy-efficiency": "SAP score on register",
        "current-energy-rating": "EPC rating on register",
        "property-type": "Property Type",
        "built-form": "Archetype - EPC",
        "total-floor-area": "Property Floor Area",
        "construction-age-band": "Property Age Band",
        "floor-height": "Property Floor Height",
        "number-habitable-rooms": "Number of Habitable Rooms",
        "walls-description": "Wall Construction",
        "roof-description": "Roof Construction",
        "floor-description": "Floor Construction",
        "mainheat-description": "Heating Type",
        "secondheat-description": "Secondary Heating",
        "transaction-type": "Reason for last EPC",
        "energy-consumption-current": "Heat Demand (kWh/m2)",
    })

    asset_list["Estimated Number of Floors"] = asset_list.apply(
        lambda x: estimate_number_of_floors(property_type=x["Property Type"]) if not pd.isnull(
            x["Property Type"]) else None, axis=1
    )

    asset_list["Property Floor Area"] = asset_list["Property Floor Area"].astype(float)
    # Replace "" value with None
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].replace("", None)
    asset_list["Number of Habitable Rooms"] = asset_list["Number of Habitable Rooms"].astype(float)

    asset_list["Estimated Perimeter (m)"] = asset_list.apply(
        lambda x: estimate_perimeter(
            floor_area=x["Property Floor Area"] / x["Estimated Number of Floors"],
            num_rooms=x["Number of Habitable Rooms"] / x["Estimated Number of Floors"],
        ), axis=1
    )

    asset_list["Estimated Heat Loss Perimeter (m2)"] = asset_list.apply(
        lambda x: estimate_external_wall_area(
            num_floors=x["Estimated Number of Floors"],
            floor_height=float(x["Property Floor Height"]) if x["Property Floor Height"] else 2.5,
            perimeter=x["Estimated Perimeter (m)"],
            built_form=x["Archetype - EPC"]
        ),
        axis=1
    )

    asset_list["Roof Insulation Thickness"] = asset_list.apply(
        lambda x: RoofAttributes(description=x["Roof Construction"]).process()["insulation_thickness"] if not pd.isnull(
            x["Roof Construction"]) else None,
        axis=1
    )

    # We produce some additional fields
    # 1) Is the SAP rating below C75
    asset_list["SAP Rating is 75 and below"] = asset_list["SAP score on register"] <= 75
    # 2) Flag anything where the EPC is older than 5 years
    cutoff_year = pd.Timestamp.now().year - 5
    asset_list[f"EPC is pre {cutoff_year}"] = (
        pd.to_datetime(asset_list["Date of last EPC"]).dt.year < cutoff_year
    )

    # 3) If we have year in the asset list, we flag entries where the built year is different from the
    # EPC Age band
    if PROPERTY_YEAR_BUILT is not None:
        asset_list["Does Age Match EPC Age Band?"] = asset_list.apply(
            lambda x: process_age_band(x, PROPERTY_YEAR_BUILT), axis=1
        )

    if HAS_NON_INTRUSIVES:
        # Empty cavity:
        # 1) Has been flagged on the non-intrusives as being empty or partially filled
        # 2) The age is before 1995
        # 3) Remove anything that likely has access issues
        asset_list["Suitable for Cavity Fill"] = (
            (asset_list["Construction"] == "CAVITY") &
            asset_list["Insulated"].isin(["EMPTY", "PARTIAL"]) &
            (
                (asset_list[PROPERTY_YEAR_BUILT] <= 1995) # TODO, Or if the EPC age band is < 1995
            )
        )

        # asset_list["Suitable for Extraction"] =
        asset_list[
            (asset_list["Construction"] == "Cavity") &
            asset_list["Insulated"].isin(["RETRO DRILLED"]) &
            (
                (asset_list[PROPERTY_YEAR_BUILT] <= 1995)
            ) &
            (
                asset_list[]
            )
        ]

    # 4) Flag properties that look like they're good candidates for solar installs
    # Firstly, flag if the fabric is completely done

    insulated_wall_substrings = [
        ", insulated", "with external insulation", "with internal insulation", "filled cavity"
    ]

    insulated_roof_substrings = [
        "(another dwelling above)", "limited insulation", "(other premises above)",
        ", no insulation",
    ]

    def check_solar_insulation_conditions(x):
        """
        Classify how complete a property's fabric insulation is, for solar screening.

        Relies on insulated_wall_substrings and insulated_roof_substrings from the enclosing scope.

        :param x: Row with "Wall Construction", "Roof Construction", "Floor Construction" and
            "Roof Insulation Thickness" entries
        :return: None when there is no wall description, otherwise one of
            "Walls, Roof and Floor have U-values below 0.7", "Confirm U-values",
            "Walls Insulated, Roof Insulated, Floor Solid",
            "Walls Insulated, Floor Solid, Loft need top-up" or "Not Fully Insulated or no data"
        """
        wall_description = x["Wall Construction"]
        if pd.isnull(wall_description):
            return None

        if "average thermal transmittance" in wall_description.lower():
            # We extract out the u-values
            wall_uvalue = extract_thermal_transmittance({}, wall_description)[0]["thermal_transmittance"]
            roof_uvalue = extract_thermal_transmittance({}, x["Roof Construction"])[0]["thermal_transmittance"]
            floor_uvalue = extract_thermal_transmittance({}, x["Floor Construction"])[0]["thermal_transmittance"]

            # A missing wall U-value cannot be shown to meet the cutoff (and would otherwise fail
            # the comparison below with a TypeError), so it is sent for manual confirmation
            if wall_uvalue is None:
                return "Confirm U-values"

            # A missing roof/floor U-value is treated as passing the cutoff
            roof_uvalue = 0 if roof_uvalue is None else roof_uvalue
            floor_uvalue = 0 if floor_uvalue is None else floor_uvalue

            # We apply some cutoffs
            if wall_uvalue < 0.7 and roof_uvalue < 0.7 and floor_uvalue < 0.7:
                return "Walls, Roof and Floor have U-values below 0.7"

            return "Confirm U-values"

        # Missing roof/floor descriptions are treated as empty text so they simply match nothing,
        # rather than raising on .lower()
        roof_description = x["Roof Construction"]
        roof_description = "" if pd.isnull(roof_description) else roof_description.lower()
        floor_description = x["Floor Construction"]
        floor_description = "" if pd.isnull(floor_description) else floor_description.lower()

        walls_insulated = any(
            insulated_substring in wall_description.lower() for insulated_substring in insulated_wall_substrings
        )

        roof_thickness = str(x["Roof Insulation Thickness"])
        roof_is_numeric = roof_thickness.isdigit()
        if roof_is_numeric:
            roof_insulated = int(roof_thickness) >= 200
        else:
            roof_insulated = any(
                insulated_substring in roof_description for insulated_substring in insulated_roof_substrings
            )

        floor_is_solid = "solid" in floor_description

        if walls_insulated and roof_insulated and floor_is_solid:
            return "Walls Insulated, Roof Insulated, Floor Solid"

        # A numeric loft thickness below 200 reaches this point: only a top-up is needed
        if walls_insulated and floor_is_solid and roof_is_numeric:
            return "Walls Insulated, Floor Solid, Loft need top-up"

        return "Not Fully Insulated or no data"

    asset_list["Solar Fabric Condition"] = asset_list.apply(check_solar_insulation_conditions, axis=1)

    asset_list["Good Solar Candidate"] = (
        asset_list["SAP Rating is 75 and below"] &
        ~asset_list["Has Solar PV"] &
        (
            asset_list["Heating Type"].isin(
                [
                    "Electric storage heaters",
                    "Room heaters, electric",
                ]
            ) | asset_list["Heating Type"].str.contains("heat pump", case=False)
        ) & (
            asset_list["Solar Fabric Condition"].isin(
                [
                    "Walls Insulated, Roof Insulated, Floor Solid",
                    "Walls, Roof and Floor have U-values below 0.7",
                    "Walls Insulated, Floor Solid, Loft need top-up"
                ]
            )
        )
    )

    def flat_analysis(asset_list):
        """
        Summarise, per postcode, how many flats have a SAP score below 75.

        :param asset_list: DataFrame with the column named by POSTCODE_COLUMN, plus
            "Property Type" and "SAP score on register"
        :return: DataFrame with one row per (postcode, property type) group whose property type is
            "flat" (case-insensitive); empty when there is no such group
        """

        # We need to deduce the building name - we strip out the house number
        def extract_building_name(x):
            # TODO: This doesn't really work
            # NOTE: not called anywhere in this function yet
            if pd.isnull(x):
                return None
            house_no = SearchEpc.get_house_number(address=x, postcode=None)
            if not house_no:
                return x.split(",")[0].strip()
            return x.replace(house_no, "").strip()

        # We want to deduce if flats have 50% of the properties below C75,
        # so we walk the (postcode, property type) groups and keep the flat ones
        records = []
        for _, members in asset_list.groupby([POSTCODE_COLUMN, "Property Type"]):
            lowered_types = members["Property Type"].str.lower()
            if "flat" not in lowered_types.values:
                continue

            flat_count = lowered_types.value_counts().get("flat", 0)
            below_c75_count = members["SAP score on register"].lt(75).sum()
            records.append(
                {
                    "Postcode": members[POSTCODE_COLUMN].iloc[0],
                    "Property Type": "Flat",
                    "Number of Flats with EPC": flat_count,
                    "Number of Flats below C75": below_c75_count,
                    "Proportion of Flat EPCs below C75": round(100 * below_c75_count / flat_count)
                }
            )

        return pd.DataFrame(records)

    flat_data = flat_analysis(asset_list)

    # For all of the columns in transformed_df, prefix with "Recommendation: "
    for col in transformed_df.columns:
        if col == "row_id":
            continue
        transformed_df = transformed_df.rename(columns={col: f"Recommendation: {col}"})

    asset_list = asset_list.merge(
        transformed_df,
        how="left",
        on="row_id"
    )
    asset_list = asset_list.drop(columns=["row_id", "index"])

    # Store as an excel
    filename = os.path.join(DATA_FOLDER, ".".join(DATA_FILENAME.split(".")[:-1])) + " EPC Data Pull.xlsx"
    # Store the data in two tabs. One for the asset list with the EPC data and the second with the flat data

    with pd.ExcelWriter(filename) as writer:
        asset_list.to_excel(writer, sheet_name="EPC Data", index=False)
        flat_data.to_excel(writer, sheet_name="Flat Data", index=False)

    matches_review = asset_list[
        [FULLADDRESS_COLUMN, ADDRESS1_COLUMN, POSTCODE_COLUMN, "Address on EPC", "Postcode on EPC"]
    ]