putting together outputs

2026-07-27 23:35:01 +00:00 · 2025-02-04 14:04:20 +00:00 · 2025-02-04 14:04:20 +00:00 · 139db23592
commit 139db23592
parent 10bc433283
3 changed files with 360 additions and 81 deletions
--- a/etl/customers/stonewater/Wave
+++ b/etl/customers/stonewater/Wave
@ -1,5 +1,6 @@
 import os
-from pyexpat import features
+from urllib import parse
+from fuzzywuzzy import fuzz

 import PyPDF2
 import re
@ -2936,6 +2937,14 @@ def identify_incorrect_packages():
    )


+def extract_sharepoint_url(x):
+    if pd.isnull(x):
+        return ""
+    return "/".join(parse.urlparse(
+        x.split(" - http")[1]
+    ).path.replace("%20", " ").split("/")[-2:])
+
+
 def revised_model():
    """
    This function implements the revised model for Stonewater, where we are looking at new priority postcodes
@ -2956,6 +2965,7 @@ def revised_model():
    original_archetypes = original_archetypes[~pd.isnull(original_archetypes["Address ID"])]
    original_archetypes = original_archetypes[original_archetypes["Address ID"] != "Address ID"]
    original_archetypes["Address ID"] = original_archetypes["Address ID"].astype(int)
+    original_archetypes["UPRN"] = original_archetypes["UPRN"].astype("Int64").astype(str)

    # Check if we have all of the addresses
    missed = original_archetypes[
@ -2965,7 +2975,7 @@ def revised_model():
    assert set(missed) == {'NOT PRIORITY POSTCODE', 'IN WAVE 2.1', 'EPC C OR ABOVE'}

    original_archetypes = original_archetypes[
-        ["Address ID", "Archetype ID", "Archetype Group Rank"]
+        ["Address ID", "Archetype ID", "Archetype Group Rank", "UPRN"]
    ]

    # Merge these archetypes on to the new priority postcodes
@ -3104,6 +3114,42 @@ def revised_model():
    # Replace \n with ""
    retrofit_assessment_data["Postcode"] = retrofit_assessment_data["Postcode"].str.replace("\n", "")

+    retrofit_assessments_data_columns = [
+        'Current SAP Rating', 'Current EPC Band', 'Primary Energy Use (kWh/yr)',
+        'Primary Energy Use Intensity (kWh/m2/yr)', 'Number of Storeys',
+        'Fuel Bill', 'Window Age Description',
+        'Window Age Description Proportion (%)',
+        'Secondary Window Age Description',
+        'Secondary Window Age Description Proportion (%)', 'Number of Windows',
+        'Total Number of Doors', 'Number of Insulated Doors',
+        'Existing Primary Heating System',
+        'Existing Primary Heating PCDF Reference',
+        'Existing Primary Heating Controls',
+        'Existing Primary Heating % of Heat',
+        'Existing Secondary Heating System',
+        'Existing Secondary Heating PCDF Reference',
+        'Existing Secondary Heating Controls',
+        'Existing Secondary Heating % of Heat', 'Secondary Heating Code',
+        'Water Heating Code', 'Total Floor Area (m2)',
+        'Total Ground Floor Area (m2)', 'RIR Floor Area',
+        'Main Building Wall Area (m2)', 'First Extension Wall Area (m2)',
+        'Number of Light Fittings', 'Number of LEL Fittings',
+        'Number of fittings needing LEL', 'Main Roof Type',
+        'Main Roof Insulation', 'Main Roof Insulation Thickness',
+        'Main Wall Type', 'Main Wall Insulation', 'Main Wall Dry-lining',
+        'Main Wall Thickness', 'Main Building Alternative Wall Type',
+        'Main Building Alternative Wall Insulation',
+        'Main Building Alternative Wall Dry-lining',
+        'Main Building Alternative Wall Thickness', 'Main Fuel'
+    ]
+    # For the columns in retrofit_assessments_data_columns, prefix all of them with Survey:
+    retrofit_assessments_data_columns_prefixed = ["Survey: " + x for x in retrofit_assessments_data_columns]
+    rename_dict = dict(zip(retrofit_assessments_data_columns, retrofit_assessments_data_columns_prefixed))
+    retrofit_assessment_data = retrofit_assessment_data.rename(columns=rename_dict)
+    retrofit_assessment_data["Survey: Current EPC Band"] = (
+        retrofit_assessment_data["Survey: Current SAP Rating"].apply(lambda x: sap_to_epc(x))
+    )
+
    # We can read in the data as needed

    # Next Step: Read in the coordinated measures and match to the extracted data
@ -3134,14 +3180,6 @@ def revised_model():
    ccs_coordination_sheet = ccs_coordination_sheet.head(87)
    ccs_coordination = pd.concat([ccs_coordination_removed_from_programme, ccs_coordination_sheet])

-    from urllib import parse
-    def extract_sharepoint_url(x):
-        if pd.isnull(x):
-            return ""
-        return "/".join(parse.urlparse(
-            x.split(" - http")[1]
-        ).path.replace("%20", " ").split("/")[-2:])
-
    ccs_coordination["folder_path"] = ccs_coordination["Sharepoint Link"].apply(lambda x: extract_sharepoint_url(x))

    ############################################################
@ -3224,8 +3262,6 @@ def revised_model():
        lambda x: extract_sharepoint_url(x)
    )

-    # Combine the data back
-
    ############################################################
    # NEW 450 COORDINATED RETROFIT ASSESSMENTS
    #############################################################
@ -3352,7 +3388,6 @@ def revised_model():
    )
    ccs_coordination = ccs_coordination[~pd.isnull(ccs_coordination["Postcode"])]
    ccs_coordination = ccs_coordination[ccs_coordination["Retrofit Assessment"] != "Outstanding"]
-    from fuzzywuzzy import fuzz

    ccs_manual_filters = {
        "35 Kittiwake Close": "Wave 2.1 Surveys/11. CCS Dorset/Kittiwake Close 35"
@ -3596,6 +3631,17 @@ def revised_model():
        matching_lookup, how="left", on="Name"
    )

+    # We now map the retrofit assessment data to the coordinated packages
+    wates_coordination = wates_coordination.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+    ccs_coordination = ccs_coordination.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+    retrofit_packages_board = retrofit_packages_board.merge(
+        retrofit_assessment_data.drop(columns=["Postcode"]), how="left", on="survey_folder"
+    )
+
    # We have 4 properties in the Wates coordination board, that we want to remove from the retrofit packages board
    to_remove = wates_coordination[
        wates_coordination["Asset ID_x"].astype(int).isin(retrofit_packages_board["Organisation Reference"])
@ -3617,8 +3663,8 @@ def revised_model():
                    'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
                    'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
                    'Solar PV', 'Other measures', 'Organisation Reference',
-                ]
-            ],
+                ] + retrofit_assessments_data_columns_prefixed
+                ],
            ccs_coordination[
                [
                    # We don't have secondary wall insulation, Flat Roof, RIR, Heating Controls,
@ -3627,8 +3673,8 @@ def revised_model():
                    'SAP Band Install Package', 'Package Approved (Client)',
                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
                    'Ventilation', 'Heating', 'Other Measures', "Asset ID.1_y",
-                ]
-            ].rename(
+                ] + retrofit_assessments_data_columns_prefixed
+                ].rename(
                columns={
                    "SAP Band Pre": "Actual SAP Band",
                    "SAP Rating Pre": "Actual SAP Rating",
@ -3651,8 +3697,8 @@ def revised_model():
                    'Wall Insulation', 'Loft Insulation', 'Windows Upgrade', 'Ext. Doors Upgrade',
                    'Ventilation', 'Heating', 'Other Measures', 'Asset ID_x'

-                ]
-            ].rename(
+                ] + retrofit_assessments_data_columns_prefixed
+                ].rename(
                columns={
                    "SAP Band Pre": "Actual SAP Band",
                    "SAP Rating Pre": "Actual SAP Rating",
@ -3681,24 +3727,8 @@ def revised_model():
        on="Organisation Reference"
    )

-    # We match the properties to their closest match
-    # We clean up the SAP ratings in the coordinated packages
-    def sap_to_number(x):
-        try:
-            return int(x)
-        except:
-            if x[-1] in ["A", "B", "C", "D", "E", "F"]:
-                return int(x[:-1])
-
-            if x[0] in ["A", "B", "C", "D", "E", "F"]:
-                return int(x[1:])
-
-    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Band"])]
-    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Actual SAP Rating"])]
-
-    coordinated_packages["Actual SAP Rating"] = coordinated_packages["Actual SAP Rating"].apply(
-        lambda x: sap_to_number(x)
-    )
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current EPC Band"])]
+    coordinated_packages = coordinated_packages[~pd.isnull(coordinated_packages["Survey: Current SAP Rating"])]

    # We need the features pertaining to these priority postcodes

@ -3721,6 +3751,11 @@ def revised_model():
            if not match.empty:
                return match

+        # Finally, we search for a property in the same Archetype
+        match = coordinated_packages[coordinated_packages["Archetype ID"] == home["Archetype ID"]]
+        if not match.empty:
+            return match
+
        return None  # No match found

    coordinated_packages["Postal Region"] = coordinated_packages["Postcode"].str.split(" ").str[0].str.strip()
@ -3732,6 +3767,12 @@ def revised_model():
    coordinated_packages["Primary Property Type"] = coordinated_packages["Property Type"].str.split(":").str[0]
    new_priority_postcodes["Primary Property Type"] = new_priority_postcodes["Property Type"].str.split(":").str[0]

+    coordinated_packages = coordinated_packages.merge(
+        new_priority_postcodes[["Organisation Reference", "Archetype ID"]],
+        how="left",
+        on="Organisation Reference"
+    )
+
    # For every property in the priority postcodes data, we look for a most appropriate matching property
    no_match = []
    matches = []
@ -3759,16 +3800,17 @@ def revised_model():
    no_match_summary = no_match_summary.sort_values("Organisation Reference", ascending=False)

    # len(no_match)
-    # 8764, 5607, 5646
+    # 8764, 5607, 5646, 5071
    # no_match_summary.shape
-    # (3953, 6), (2948, 6), (2969, 7)
+    # (3953, 6), (2948, 6), (2969, 7), (2575, 7)

    matches_df = pd.DataFrame(matches)
    matches_df = matches_df.merge(
-        coordinated_packages[["Organisation Reference", "Actual SAP Band", "Actual SAP Rating"]],
+        coordinated_packages[["Organisation Reference", "Survey: Current EPC Band", "Survey: Current SAP Rating"]],
        left_on="Best Match Organisation Reference", right_on="Organisation Reference",
        suffixes=("", " - Closest Match")
    )
+
    # We want to aggregate the matches, when we have multiple
    aggregated_matches_df = []
    for org_ref, mapped_matches in matches_df.groupby("Organisation Reference"):
@ -3778,19 +3820,21 @@ def revised_model():
                    "Organisation Reference": org_ref,
                    "Number of matches": 1,
                    "Proportion": 100,
-                    "Estimated SAP Rating": mapped_matches["Actual SAP Rating"].values[0],
-                    "Estimated EPC Rating": sap_to_epc(mapped_matches["Actual SAP Rating"].values[0])
+                    "Estimated SAP Rating": mapped_matches["Survey: Current SAP Rating"].values[0],
+                    "Estimated EPC Rating": mapped_matches["Survey: Current EPC Band"].values[0]
                }
            )
            continue

        # We need to aggregate the matches, since we have multiple
-        average_rating = mapped_matches["Actual SAP Rating"].mean()
+        average_rating = mapped_matches["Survey: Current SAP Rating"].mean()
        number_of_matches = mapped_matches.shape[0]
        average_epc_rating = sap_to_epc(average_rating)
        # proportion is the number of properties that have this EPC rating
        proportion_with_this_epc = int(
-            mapped_matches[mapped_matches["Actual SAP Band"] == average_epc_rating].shape[0] / number_of_matches * 100)
+            mapped_matches[mapped_matches["Survey: Current EPC Band"] == average_epc_rating].shape[
+                0] / number_of_matches * 100
+        )
        aggregated_matches_df.append(
            {
                "Organisation Reference": org_ref,
@ -3804,12 +3848,220 @@ def revised_model():
    aggregated_matches_df = pd.DataFrame(aggregated_matches_df)

    mapped_priority_list = new_priority_postcodes.merge(
-        matches_df, on="Organisation Reference",
+        aggregated_matches_df, on="Organisation Reference", how="left"
    )
-    # We merge on the EPC ratings for the matched properties
-    mapped_priority_list = mapped_priority_list.merge(

+    mapped_priority_list["address1"] = mapped_priority_list["Address"].str.split(",").str[0]
+
+    # If we have a leading number like 01, 02, 03, 04, 05, 06, 07, 08, 09, we remove the leading 0
+
+    def remove_leading_zero(address):
+        return re.sub(r"^0([1-9]) ", r"\1 ", address)
+
+    # Strip the leading zero from the first address line of every property
+    mapped_priority_list["address1"] = mapped_priority_list["address1"].apply(remove_leading_zero)
+    mapped_priority_list["address1"] = np.where(
+        mapped_priority_list["Organisation Reference"] == 37004,
+        "8 Mason Road",
+        mapped_priority_list["address1"]
    )
+    mapped_priority_list["address1"] = np.where(
+        mapped_priority_list["Organisation Reference"] == 37003,
+        "9 Mason Road",
+        mapped_priority_list["address1"]
+    )
+
+    mapped_priority_list = mapped_priority_list.rename(
+        columns={"UPRN": "uprn"}
+    )
+    mapped_priority_list["row_id"] = mapped_priority_list["Organisation Reference"]
+
+    # Let's get the newest EPC data for these properties
+    # We merge on UPRN, when we have it
+    # from etl.route_march_data_pull.app import get_data
+    # epc_data, errors, nodata = get_data(
+    #     asset_list=mapped_priority_list,
+    #     fulladdress_column="Address",
+    #     address1_column="address1",
+    #     postcode_column="Postcode",
+    #     manual_uprn_map={},
+    #     epc_api_only=True
+    # )
+    #
+    # epc_df = pd.DataFrame(epc_data)
+    # epc_df.to_csv(
+    #     os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"), index=False
+    # )
+    epc_df = pd.read_csv(os.path.join(CUSTOMER_FOLDER_PATH, "Jan 2025 Project", "full_epc_data.csv"))
+    epc_df = epc_df.rename(columns={"row_id": "Organisation Reference"})
+
+    # We now package up the data
+
+    # Sheet 1 is the base coordination data
+    output_coordination_sheet = coordinated_packages[
+        [
+            "Name", "Postcode", 'Organisation Reference', 'Package Ref',
+            'Main Wall Insulation', 'Secondary Wall Insulation', 'Loft insulation',
+            'Flat Roof', 'Room in Roof', 'Window Upgrade', 'Door Upgrade',
+            'Ventilation', 'Main Heating', 'Water Heating', 'Heating Controls',
+            'Solar PV', 'Other measures', 'Survey: Current SAP Rating', 'Survey: Current EPC Band',
+            'Survey: Primary Energy Use (kWh/yr)',
+            'Survey: Primary Energy Use Intensity (kWh/m2/yr)',
+            'Survey: Number of Storeys', 'Survey: Fuel Bill',
+            'Survey: Window Age Description',
+            'Survey: Window Age Description Proportion (%)',
+            'Survey: Secondary Window Age Description',
+            'Survey: Secondary Window Age Description Proportion (%)',
+            'Survey: Number of Windows', 'Survey: Total Number of Doors',
+            'Survey: Number of Insulated Doors',
+            'Survey: Existing Primary Heating System',
+            'Survey: Existing Primary Heating PCDF Reference',
+            'Survey: Existing Primary Heating Controls',
+            'Survey: Existing Primary Heating % of Heat',
+            'Survey: Existing Secondary Heating System',
+            'Survey: Existing Secondary Heating PCDF Reference',
+            'Survey: Existing Secondary Heating Controls',
+            'Survey: Existing Secondary Heating % of Heat',
+            'Survey: Secondary Heating Code', 'Survey: Water Heating Code',
+            'Survey: Total Floor Area (m2)', 'Survey: Total Ground Floor Area (m2)',
+            'Survey: RIR Floor Area', 'Survey: Main Building Wall Area (m2)',
+            'Survey: First Extension Wall Area (m2)',
+            'Survey: Number of Light Fittings', 'Survey: Number of LEL Fittings',
+            'Survey: Number of fittings needing LEL', 'Survey: Main Roof Type',
+            'Survey: Main Roof Insulation',
+            'Survey: Main Roof Insulation Thickness', 'Survey: Main Wall Type',
+            'Survey: Main Wall Insulation', 'Survey: Main Wall Dry-lining',
+            'Survey: Main Wall Thickness',
+            'Survey: Main Building Alternative Wall Type',
+            'Survey: Main Building Alternative Wall Insulation',
+            'Survey: Main Building Alternative Wall Dry-lining',
+            'Survey: Main Building Alternative Wall Thickness',
+            'Survey: Main Fuel',
+            'Walls', 'Roofs', 'Heating', 'Main Fuel', 'Age', 'Property Type'
+        ]
+    ].rename(
+        columns={
+            'Walls': "Parity - Walls",
+            'Roofs': "Parity - Roof",
+            'Heating': "Parity - Heating",
+            'Main Fuel': "Parity - Fuel",
+            'Age': "Parity - Age Band",
+            'Property Type': "Parity - Property Type"
+        }
+    )
+
+    # Sheet 2 is the lookup table which maps the properties to their closest match
+    # We bring in the parity attributes of both the property and its best match so we can compare them side-by-side
+    mapped_lookup = matches_df[
+        [
+            'Organisation Reference',
+            'Best Match Organisation Reference',
+            'Survey: Current EPC Band',
+            'Survey: Current SAP Rating'
+        ]
+    ].rename(
+        columns={
+            'Best Match Organisation Reference': "Best Match - Organisation Reference",
+            "Survey: Current EPC Band": "Best Match - Survey: Current EPC Band",
+            'Survey: Current SAP Rating': "Best Match - Survey: Current SAp Rating"
+        }
+    ).merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]],
+        how="left",
+        on="Organisation Reference"
+    ).merge(
+        features[["Organisation Reference", "Walls", "Roofs", "Heating", "Main Fuel", "Age", "Property Type"]].rename(
+            columns={
+                "Organisation Reference": "Best Match - Organisation Reference",
+                "Walls": "Best Match - Walls",
+                "Roofs": "Best Match - Roof",
+                "Heating": "Best Match - Heating",
+                "Main Fuel": "Best Match - Main Fuel",
+                "Age": "Best Match - Age",
+                "Property Type": "Best Match - Property Type"
+            }
+        ),
+        how="left",
+        on="Best Match - Organisation Reference"
+    ).merge(
+        coordinated_packages[
+            [
+                "Organisation Reference", 'Survey: Main Wall Type', 'Survey: Main Wall Insulation',
+                'Survey: Main Roof Type', 'Survey: Main Roof Insulation', 'Survey: Main Roof Insulation Thickness',
+                'Survey: Existing Primary Heating System',
+            ]
+        ].rename(
+            columns={
+                "Organisation Reference": "Best Match - Organisation Reference",
+                'Survey: Main Wall Type': 'Best Match - Survey: Main Wall Type',
+                'Survey: Main Wall Insulation': 'Best Match - Survey: Main Wall Insulation',
+                'Survey: Main Roof Type': 'Best Match - Survey: Main Roof Type',
+                'Survey: Main Roof Insulation': 'Best Match - Survey: Main Roof Insulation',
+                'Survey: Main Roof Insulation Thickness': 'Best Match - Survey: Main Roof Insulation Thickness',
+                'Survey: Existing Primary Heating System': 'Best Match - Survey: Existing Primary Heating System',
+            }
+        ),
+        how="left",
+        on="Best Match - Organisation Reference"
+    )
+
+    # Finally, we have each property against its mapped home, with the estimated SAP scores and the EPC data
+    worksheet = mapped_priority_list[
+        [
+            'Organisation Reference', 'Address', 'Postcode', 'Address ID', 'uprn', 'Archetype ID',
+            'SAP', 'SAP Band', "Property Type", "Walls", "Roofs", 'Glazing',
+            'Heating', 'Main Fuel', 'Hot Water', 'Estimated SAP Rating', 'Estimated EPC Rating'
+        ]
+    ].rename(
+        columns={
+            "SAP": "Parity - SAP Rating",
+            "SAP Band": "Parity - EPC Rating",
+            "Property Type": "Parity - Property Type",
+            "Walls": "Parity - Walls",
+            "Roofs": "Parity - Roofs",
+            'Glazing': "Parity - Glazing",
+            'Heating': 'Parity - Heating',
+            'Main Fuel': 'Parity - Main Fuel',
+            'Hot Water': 'Parity - Hot Water',
+        }
+    ).merge(
+        epc_df[
+            [
+                "Organisation Reference",
+                "uprn",
+                "current-energy-efficiency",
+                "current-energy-rating",
+                "lodgement-date",
+                "construction-age-band",
+                "walls-description",
+                "roof-description",
+                "mainheat-description",
+                "windows-description",
+                "hotwater-description",
+                "main-fuel",
+                "total-floor-area",
+            ]
+        ].rename(
+            columns={
+                "uprn": "Last EPC - uprn",
+                "current-energy-efficiency": "Last EPC - SAP Score",
+                "current-energy-rating": "Last EPC - EPC Rating",
+                "lodgement-date": "Last EPC - Date Lodged",
+                "construction-age-band": "Last EPC - Age Band",
+                "walls-description": "Last EPC - Walls",
+                "roof-description": "Last EPC - Roof",
+                "mainheat-description": "Last EPC - Heating",
+                "windows-description": "Last EPC - Windows",
+                "hotwater-description": "Last EPC - Hot Water",
+                "main-fuel": "Last EPC - Main Fuel",
+                "total-floor-area": "Last EPC - Total Floor Area"
+            }
+        ),
+        how="left",
+        on='Organisation Reference'
+    )
+
+    worksheet["Years Since Last EPC"]

 # if __name__ == "__main__":
 #     main()
--- a/etl/route_march_data_pull/app.py
+++ b/etl/route_march_data_pull/app.py
@ -20,7 +20,7 @@ load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")


-def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map):
+def get_data(asset_list, fulladdress_column, address1_column, postcode_column, manual_uprn_map, epc_api_only=True):
    epc_data = []
    errors = []
    no_epc = []
@ -33,6 +33,11 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
            if house_no is None:
                house_no = house_number
            uprn = manual_uprn_map.get(full_address, None)
+            if uprn is None and home.get("uprn"):
+                uprn = home["uprn"]
+
+            if pd.isnull(uprn):
+                uprn = None

            searcher = SearchEpc(
                address1=str(house_no),
@ -88,6 +93,15 @@ def get_data(asset_list, fulladdress_column, address1_column, postcode_column, m
                no_epc.append(home["row_id"])
                continue

+            if epc_api_only:
+                epc = {
+                    "row_id": home["row_id"],
+                    **searcher.newest_epc.copy()
+                }
+
+                epc_data.append(epc)
+                continue
+
            # Look for EPC recommendatons
            try:
                property_recommendations = searcher.client.domestic.recommendations(searcher.newest_epc["lmk-key"])
--- a/survey_report/app.py
+++ b/survey_report/app.py
@ -1,6 +1,9 @@
 import os
 import PyPDF2
 from string import Template
+
+import pandas as pd
+
 from survey_report.extraction.detect_report_type import detect_report_type
 from survey_report.extraction.quidos import SiteNotesExtractor, EPRExtractor

@ -34,44 +37,54 @@ def handle():
    :return:
    """

-    data_folder = "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2"
+    folders = [
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 1",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 2",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 3",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 4",
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/survey_report/example_data/Flat 5",
+    ]
+    data = []
+    for data_folder in folders:

-    folder_contents = os.listdir(data_folder)
-    # We look for the following files:
-    # Site notes
-    file_mapping = {}
-    for file in folder_contents:
-        # Check if it's a pdf file
-        if not file.endswith(".pdf"):
-            continue
-        filepath = os.path.join(data_folder, file)
-        with (open(filepath, "rb") as f):
-            pdf = PyPDF2.PdfReader(f)
-            first_page = pdf.pages[0].extract_text()
-            text = ""
-            for page in pdf.pages:
-                text += page.extract_text()
+        folder_contents = os.listdir(data_folder)
+        # We look for the following files:
+        # Site notes
+        file_mapping = {}
+        for file in folder_contents:
+            # Check if it's a pdf file
+            if not file.endswith(".pdf"):
+                continue
+            filepath = os.path.join(data_folder, file)
+            with (open(filepath, "rb") as f):
+                pdf = PyPDF2.PdfReader(f)
+                first_page = pdf.pages[0].extract_text()
+                text = ""
+                for page in pdf.pages:
+                    text += page.extract_text()

-        # Check the report type
-        report_type = detect_report_type(first_page)
-        if report_type is not None:
-            file_mapping[report_type] = text
+            # Check the report type
+            report_type = detect_report_type(first_page)
+            if report_type is not None:
+                file_mapping[report_type] = text

-    # This is only set up to work with quido site notes so we must have it
-    site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
-    site_notes = site_notes_extractor.extract_all()
+        # This is only set up to work with quido site notes so we must have it
+        site_notes_extractor = SiteNotesExtractor(file_mapping["quidos_site_notes"])
+        site_notes = site_notes_extractor.extract_all()

-    # We also must have an EPR
-    epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
-    epr = epr_extractor.extract_all()
+        # We also must have an EPR
+        epr_extractor = EPRExtractor(file_mapping["quidos_epr"])
+        epr = epr_extractor.extract_all()

-    # We now produce the combined data sheet which is the starting figure:
-    data_sheet = {**epr, **site_notes}
-    del data_sheet['Building Dimensions']
-    # We unnest the Total Building Dimensions
-    data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
-    data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
-    del data_sheet["Total Building Dimensions"]
+        # We now produce the combined data sheet which is the starting figure:
+        data_sheet = {**epr, **site_notes}
+        del data_sheet['Building Dimensions']
+        # We unnest the Total Building Dimensions
+        data_sheet["Total Building Floor Area (m2)"] = data_sheet["Total Building Dimensions"]["floor_area"]
+        data_sheet["Total Building Heat Loss Area (m2)"] = data_sheet["Total Building Dimensions"]["heat_loss_area"]
+        del data_sheet["Total Building Dimensions"]
+        data.append(data_sheet)
+    data = pd.DataFrame(data)

    # Generate the HTML report
    # Placeholder locations