From 743422e8fec13381c552f177a1caad15cedd7471 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Tue, 11 Jun 2024 18:23:19 +0100
Subject: [PATCH] Parity comparison investigation, stonewater wip

---
 backend/Property.py                           |  67 ++++++
 backend/SearchEpc.py                          |  17 +-
 backend/apis/GoogleSolarApi.py                | 211 +++++++++++++++++-
 backend/app/plan/router.py                    |  38 ----
 backend/ml_models/Valuation.py                |   3 +
 etl/customers/goldman/property_ownership.py   |  76 +++++++
 .../northern_gorup/test_asset_list.py         |  43 ++++
 .../places_for_people/parity_comparison.py    | 164 ++++++++++++++
 etl/customers/stonewater/shdf_3_clustering.py |  71 ++++++
 recommendations/Costs.py                      |  30 +--
 recommendations/SolarPvRecommendations.py     |   7 +-
 recommendations/WallRecommendations.py        |   2 +-
 12 files changed, 666 insertions(+), 63 deletions(-)
 create mode 100644 etl/customers/northern_gorup/test_asset_list.py
 create mode 100644 etl/customers/places_for_people/parity_comparison.py

diff --git a/backend/Property.py b/backend/Property.py
index 6336e42d..3599f21b 100644
--- a/backend/Property.py
+++ b/backend/Property.py
@@ -162,6 +162,9 @@ class Property:
         self.current_energy_bill = None
         self.expected_energy_bill = None
 
+        self.heating_energy_source = None
+        self.hot_water_energy_source = None
+
         self.recommendations_scoring_data = []
 
         self.parse_kwargs(kwargs)
@@ -585,6 +588,7 @@ class Property:
             floor_area_decile_thresholds=floor_area_decile_thresholds,
         )
         self.set_energy_source()
+        self.find_energy_sources()
 
     def set_spatial(self, spatial: pd.DataFrame):
         """
@@ -993,3 +997,66 @@ class Property:
 
         # Set the energy source based on the conditions above
         self.energy_source = energy_source
+
+    def find_energy_sources(self):
+        # Based on the heating and the hot water
+        heating_fuel_mapping = {
+            'has_mains_gas': 'Natural Gas',
+            'has_electric': 'Electricity',
+            'has_oil': 'Oil',
+            'has_wood_logs': 'Wood Logs',
+            'has_coal': 'Coal',
+            'has_anthracite': 'Anthracite',
+            'has_smokeless_fuel': 'Smokeless Fuel',
+            'has_lpg': 'LPG',
+            'has_b30k': 'B30K Biofuel',
+            'has_air_source_heat_pump': 'Electricity',
+            'has_ground_source_heat_pump': 'Electricity',
+            'has_water_source_heat_pump': 'Electricity',
+            'has_electric_heat_pump': 'Electricity',
+            'has_solar_assisted_heat_pump': 'Electricity',
+            'has_exhaust_source_heat_pump': 'Electricity',
+            'has_community_heat_pump': 'Electricity',
+            'has_wood_pellets': 'Wood Pellets',
+            'has_community_scheme': 'Varied (Community Scheme)'
+        }
+
+        # Hot water
+        heater_type_to_fuel = {
+            'gas instantaneous': 'Natural Gas',
+            'electric heat pump': 'Electricity',
+            'electric immersion': 'Electricity',
+            'gas boiler': 'Natural Gas',
+            'oil boiler': 'Oil',
+            'electric instantaneous': 'Electricity',
+            'gas multipoint': 'Natural Gas',
+            'heat pump': 'Electricity',
+            'solid fuel boiler': 'Solid Fuel',
+            'solid fuel range cooker': 'Solid Fuel',
+            'room heaters': 'Varied'  # Could be any fuel, further specifics needed based on context
+        }
+
+        # Define a mapping from system types to general categories or modifications of fuel types
+        system_type_modification = {
+            'from main system': 'Main System',
+            'from secondary system': 'Secondary System',
+            'from second main heating system': 'Secondary System',
+            'community scheme': 'Community Scheme'
+        }
+
+        self.heating_energy_source = [
+            fuel for key, fuel in heating_fuel_mapping.items() if self.main_heating.get(key, False)
+        ]
+        if len(self.heating_energy_source) == 0 or len(self.heating_energy_source) > 1:
+            raise Exception("Investigate em")
+
+        self.heating_energy_source = self.heating_energy_source[0]
+
+        if self.hotwater["heater_type"] is not None:
+            self.hot_water_energy_source = heater_type_to_fuel[self.hotwater["heater_type"]]
+        else:
+            fuel = system_type_modification[self.hotwater["system_type"]]
+            if fuel == 'Main System':
+                self.hot_water_energy_source = self.heating_energy_source
+            else:
+                raise Exception("Investiage me")
diff --git a/backend/SearchEpc.py b/backend/SearchEpc.py
index 9724ffd1..275669cc 100644
--- a/backend/SearchEpc.py
+++ b/backend/SearchEpc.py
@@ -434,7 +434,8 @@ class SearchEpc:
         self, initial_postcode: str,
         lmks_to_drop: list[str] | None = None,
         built_form: str = "",
-        property_type: str = ""
+        property_type: str = "",
+        exclude_old: bool = False
     ):
         """
         Fetches and processes EPC data for a given initial postcode, applying successive trimming
@@ -453,6 +454,7 @@ class SearchEpc:
         :param lmks_to_drop: List of 'lmk-key' values to be excluded from the EPC data.
         :param built_form: The 'built-form' value to be used for filtering the EPC data.
         :param property_type: The 'property-type' value to be used for filtering the EPC data.
+        :param exclude_old: Flag to exclude EPC data older than 10 years.
         :return:
         """
 
@@ -483,6 +485,13 @@ class SearchEpc:
                 if not epc_data.empty:
                     # Further processing of the EPC data
                     epc_data['lodgement-datetime'] = pd.to_datetime(epc_data['lodgement-datetime'], errors='coerce')
+
+                    if exclude_old:
+                        # Exclude EPC data older than 10 years
+                        epc_data = epc_data[
+                            epc_data["lodgement-datetime"] > (pd.Timestamp.now() - pd.DateOffset(years=10))
+                            ]
+
                     epc_data = epc_data.sort_values("lodgement-datetime", ascending=False).groupby("uprn").head(1)
                     epc_data["house_number"] = epc_data["address"].apply(lambda add1: self.get_house_number(add1))
                     epc_data["numeric_house_number"] = epc_data["house_number"].apply(
@@ -583,7 +592,8 @@ class SearchEpc:
             initial_postcode=self.postcode,
             lmks_to_drop=lmks_to_drop,
             built_form=built_form,
-            property_type=property_type
+            property_type=property_type,
+            exclude_old=exclude_old
         )
 
         # If we have missing lodgment date, we fill it with inspection-date
@@ -591,9 +601,6 @@ class SearchEpc:
         # If we still have missing dates, we set it to the mean of the non NA dates
         epc_data["lodgement-datetime"] = epc_data["lodgement-datetime"].fillna(epc_data["lodgement-datetime"].mean())
 
-        if exclude_old:
-            epc_data = epc_data[epc_data["lodgement-datetime"] > pd.Timestamp.now() - pd.DateOffset(years=10)]
-
         # For each attribute, we need to determine the datatype and use an appropriate method
         # to estimate.
         estimated_epc = {}
diff --git a/backend/apis/GoogleSolarApi.py b/backend/apis/GoogleSolarApi.py
index 205a3560..8ee7017e 100644
--- a/backend/apis/GoogleSolarApi.py
+++ b/backend/apis/GoogleSolarApi.py
@@ -1,10 +1,15 @@
+import pandas as pd
+
 from backend.Property import Property
 from backend.SearchEpc import SearchEpc
 from etl.epc.Record import EPCRecord
 from dotenv import load_dotenv
-from utils.s3 import read_dataframe_from_s3_parquet
+from utils.s3 import read_dataframe_from_s3_parquet, read_from_s3
 import os
 import requests
+import msgpack
+from functools import lru_cache
+import time
 
 load_dotenv(dotenv_path="backend/.env")
 EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
@@ -13,6 +18,8 @@ EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
 uprn = 100040099104
 # This is for 353A, Hermitage Lane, ME16 9NT (one of the e.on properties)
 uprn = 200000964454
+# This is for 14 Victoria Road, Cross Hills, KEIGHLEY, North Yorkshire, ENGLAND, BD20 8SY
+uprn = 100050346517
 
 cleaning_data = read_dataframe_from_s3_parquet(
     bucket_name="retrofit-data-dev", file_key="sap_change_model/cleaning_dataset.parquet",
@@ -49,6 +56,25 @@ p = Property(
 
 p.get_spatial_data(uprn_filenames)
 
+cleaned = read_from_s3(
+    s3_file_name="cleaned_epc_data/cleaned.bson",
+    bucket_name="retrofit-data-dev"
+)
+
+cleaned = msgpack.unpackb(cleaned, raw=False)
+
+from etl.solar.SolarPhotoSupply import SolarPhotoSupply
+
+photo_supply_lookup, floor_area_decile_thresholds = SolarPhotoSupply.load(bucket="retrofit-data-dev")
+
+p.get_components(
+    cleaned=cleaned,
+    photo_supply_lookup=photo_supply_lookup,
+    floor_area_decile_thresholds=floor_area_decile_thresholds
+)
+p.hot_water_energy_source
+p.heating_energy_source
+
 longitude = p.spatial["longitude"]
 latitude = p.spatial["latitude"]
 
@@ -73,14 +99,29 @@ from pprint import pprint
 
 pprint(solar_potential)
 
+# This is the maximum number of panels that can be installed
+solar_potential["maxArrayPanelsCount"]
+
 # This is the size of the panels used in the calculation - 400 watt
 solar_potential["panelCapacityWatts"]
+
 # Height of the panels used
 solar_potential["panelHeightMeters"]
+
 # Width of the panels used
 solar_potential["panelWidthMeters"]
 
-solar_potential["wholeRoofStats"]
+# This is the maximum area that can be covered by the panels
+solar_potential["maxArrayAreaMeters2"]
+
+# This is the area of the roof
+solar_potential["wholeRoofStats"]["areaMeters2"]
+
+# This is the ground footprint area covered by the roof
+solar_potential["wholeRoofStats"]["groundAreaMeters2"]
+
+solar_potential["solarPanelConfigs"][0]
+solar_potential["solarPanelConfigs"][1]
 
 # Copy of response for testing - 6 Laura Close, Tintagel, PL34 0EB
 # {'name': 'buildings/ChIJ2yC6t4KEa0gRh2TIssogI7k', 'center': {'latitude': 50.667375, 'longitude': -4.7416833},
@@ -334,3 +375,169 @@ solar_potential["wholeRoofStats"]
 # 'orientation': 'PORTRAIT', 'yearlyEnergyDcKwh': 278.3281, 'segmentIndex': 1}]}, 'boundingBox': {'sw': {'latitude':
 # 50.6672904, 'longitude': -4.741778}, 'ne': {'latitude': 50.667431199999996, 'longitude': -4.7415536}},
 # 'imageryQuality': 'MEDIUM', 'imageryProcessedDate': {'year': 2024, 'month': 4, 'day': 18}}
+
+
+self = GoogleSolarApi(api_key=api_key)
+import numpy as np
+from recommendations.Costs import MCS_SOLAR_PV_COST_DATA
+
+
+class GoogleSolarApi:
+    NORTH_FACING_AZIMUTH_RANGE = (-30, 30)
+
+    def __init__(self, api_key, max_retries=5):
+        """
+        Initialize the GoogleSolarApi class with the provided API key and maximum retries.
+
+        :param api_key: The API key to authenticate requests to the Google Solar API.
+        :param max_retries: The maximum number of retries for the API request (default is 5).
+        """
+        self.api_key = api_key
+        self.max_retries = max_retries
+        self.base_url = "https://solar.googleapis.com/v1"
+
+        self.insights_data = None
+        self.roof_segments = []
+
+        # property attributes:
+        self.floor_area = None
+        self.roof_area = None
+        self.roof_segment_indexes = None
+        self.panel_area = None
+
+    def get_building_insights(self, longitude, latitude, required_quality="MEDIUM", max_retries=None):
+        """
+        Make an API request to retrieve building insights based on the given longitude and latitude, with retry
+        mechanism.
+
+        :param longitude: The longitude of the location.
+        :param latitude: The latitude of the location.
+        :param required_quality: The required quality of the data (default is "MEDIUM").
+        :param max_retries: The maximum number of retries for the API request (default is None, which uses the
+        instance's max_retries).
+        :return: The JSON response containing the building insights data.
+        """
+        if max_retries is None:
+            max_retries = self.max_retries
+
+        insights_url = f"{self.base_url}/buildingInsights:findClosest"
+        params = {
+            'location.latitude': f'{latitude:.5f}',
+            'location.longitude': f'{longitude:.5f}',
+            'requiredQuality': required_quality,
+            'key': self.api_key
+        }
+
+        attempt = 0
+        while attempt < max_retries:
+            try:
+                response = requests.get(insights_url, params=params)
+                response.raise_for_status()  # Raise an error for bad status codes
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                attempt += 1
+                print(f"Attempt {attempt} failed: {e}")
+                time.sleep(2 ** attempt)  # Exponential backoff
+                if attempt >= max_retries:
+                    raise
+
+    @lru_cache(maxsize=128)
+    def get(self, longitude, latitude, required_quality="MEDIUM"):
+        """
+        Wrapper function that calls get_building_insights and extracts roof segments, with caching.
+
+        :param longitude: The longitude of the location.
+        :param latitude: The latitude of the location.
+        :param required_quality: The required quality of the data (default is "MEDIUM").
+        :return: The JSON response containing the building insights data.
+        """
+
+        # TODO - can we make a request which includes the 30cm buffer from the edge of the roof?
+        self.insights_data = self.get_building_insights(longitude, latitude, required_quality)
+
+        # Extract key data from the insights response
+        self.roof_segments = self.insights_data["solarPotential"].get('roofSegmentStats', [])
+        self.floor_area = self.insights_data["solarPotential"]["wholeRoofStats"]['groundAreaMeters2']
+        self.roof_area = self.insights_data["solarPotential"]["wholeRoofStats"]['areaMeters2']
+        self.panel_area = (
+            self.insights_data["solarPotential"]["panelHeightMeters"] *
+            self.insights_data["solarPotential"]["panelWidthMeters"]
+        )
+
+        # Automatically exclude north-facing segments
+        self.exclude_north_facing_segments()
+
+        self.roof_segment_indexes = [segment['segmentIndex'] for segment in self.roof_segments]
+
+        # We now start finding the solar panel configurations
+        self.optimise_solar_configuration()
+
+    def optimise_solar_configuration(self):
+        """
+        Optimise the solar panel configuration for the building.
+        :return:
+        """
+
+        # Remove any north facing roof segments
+        panel_performance = []
+        for config in self.insights_data["solarPotential"]["solarPanelConfigs"]:
+            roof_segment_summaries = config["roofSegmentSummaries"]
+            # Filter on just the segments in self.roof_segment_indexes
+            roof_segment_summaries = [
+                segment for segment in roof_segment_summaries if segment["segmentIndex"] in self.roof_segment_indexes
+            ]
+
+            roi_summary = []
+            for segment in roof_segment_summaries:
+                wattage = segment["panelsCount"] * self.insights_data["solarPotential"]["panelCapacityWatts"]
+                generated_energy = segment["yearlyEnergyDcKwh"]
+                ratio = generated_energy / wattage
+                cost = MCS_SOLAR_PV_COST_DATA["average_cost_per_kwh"] * (generated_energy / 1000)
+                roi_summary.append(
+                    {
+                        "segmentIndex": segment["segmentIndex"],
+                        "wattage": wattage,
+                        "generatedEnergy": generated_energy,
+                        "ratio": ratio,
+                        "n_panels": segment["panelsCount"],
+                        "cost": cost
+                    }
+                )
+
+            roi_summary = pd.DataFrame(roi_summary)
+
+            weighted_ratio = np.average(
+                roi_summary["ratio"].values, weights=roi_summary["generatedEnergy"].values
+            )
+            total_cost = roi_summary["cost"].sum()
+            total_energy = roi_summary["generatedEnergy"].sum()
+
+            panel_performance.append(
+                {
+                    "n_panels": roi_summary["n_panels"].sum(),
+                    "total_energy": total_energy,
+                    "total_cost": total_cost,
+                    "weighted_ratio": weighted_ratio
+                }
+            )
+
+        panel_performance = pd.DataFrame(panel_performance)
+        panel_performance = panel_performance.sort_values("weighted_ratio", ascending=False)
+
+    def exclude_north_facing_segments(self):
+        """
+        Filter out any north-facing roof segments from the roof_segments attribute.
+
+        North-facing segments are defined as those with an azimuth between -30 and 30 degrees.
+        """
+
+        filtered_segments = []
+        for segment_index, segment in enumerate(self.roof_segments):
+            segment["segmentIndex"] = segment_index
+            # Check if the segment is north-facing
+            if self.NORTH_FACING_AZIMUTH_RANGE[0] <= segment['azimuthDegrees'] <= self.NORTH_FACING_AZIMUTH_RANGE[1]:
+                continue
+
+            filtered_segments.append(segment)
+
+        self.roof_segments = filtered_segments
diff --git a/backend/app/plan/router.py b/backend/app/plan/router.py
index 91a5ce0d..9caab324 100644
--- a/backend/app/plan/router.py
+++ b/backend/app/plan/router.py
@@ -1206,41 +1206,3 @@ def check_mds(results, input_properties, recommendations, optimise_measures):
     hhr_check = pd.DataFrame(hhr_check)
 
     return walls_check, hhr_check
-
-
-from utils.s3 import read_dataframe_from_s3_parquet
-
-z = read_dataframe_from_s3_parquet(
-    bucket_name="retrofit-data-dev",
-    file_key="sap_change_model/2024-05-28-19-08-25/dataset_rooms.parquet"
-)
-
-k = z[z["heat_demand_ending"] != z["heat_demand_starting"]]
-k = k[k["walls_thermal_transmittance"] == k["walls_thermal_transmittance_ending"]]
-k = k[k["roof_thermal_transmittance"] == k["roof_thermal_transmittance_ending"]]
-k = k[k["floor_thermal_transmittance"] == k["floor_thermal_transmittance_ending"]]
-ending_cols = [c for c in k.columns if "_ending" in c]
-eg = k.head(2).tail(1).squeeze()
-
-diff = []
-for c in ending_cols:
-    split = c.split("_ending")[0]
-    if split + "_starting" in k.columns:
-        starting_col = split + "_starting"
-    else:
-        starting_col = split
-
-    b4 = eg[starting_col]
-    after = eg[c]
-    if b4 != after:
-        diff.append(
-            {
-                "measure": split,
-                "starting": b4,
-                "ending": after
-            }
-        )
-diff = pd.DataFrame(diff)
-eg["heat_demand_starting"]
-eg["heat_demand_ending"]
-eg["uprn"]
diff --git a/backend/ml_models/Valuation.py b/backend/ml_models/Valuation.py
index dd77fb4b..1af38194 100644
--- a/backend/ml_models/Valuation.py
+++ b/backend/ml_models/Valuation.py
@@ -90,6 +90,9 @@ class PropertyValuation:
         41222760: 46_000,  # Based on Zoopla
         41222761: 270_000,  # Based on Zoopla
         41212534: 38_000,  # Based on Zoopla
+        # Northern Group Pilot - search by going to https://www.zoopla.co.uk/property/uprn/{uprn}/
+        10070868263: 194_000,  # Based on Zoopla
+        10070868244: 195_000,  # Based on Zoopla
     }
 
     # We base our valuation uplifts on a number of sources
diff --git a/etl/customers/goldman/property_ownership.py b/etl/customers/goldman/property_ownership.py
index 44fa7142..500963a1 100644
--- a/etl/customers/goldman/property_ownership.py
+++ b/etl/customers/goldman/property_ownership.py
@@ -527,3 +527,79 @@ def company_aggregation():
     aggregation = aggregation.sort_values("Number of Properties", ascending=False)
 
     aggregation.to_excel("Company ownership aggregation.xlsx")
+
+
+def prepare_anonymised_data():
+    investment_50m_properties = pd.read_excel("investment_50m_properties 28th May.xlsx", header=0)
+    investment_epc_data = pd.read_excel("portfolio_epc_data_50m 28th May.xlsx", header=0)
+    valuations = pd.read_excel("property value.xlsx", header=0)
+
+    # Merge these datasets
+    df = investment_50m_properties.merge(
+        investment_epc_data[
+            ["UPRN", "PROPERTY_TYPE", "BUILT_FORM", "TOTAL_FLOOR_AREA", "LODGEMENT_DATE", "POSTCODE"]
+        ].rename(
+            columns={
+                "PROPERTY_TYPE": "Property Type",
+                "BUILT_FORM": "Property Archetype",
+                "TOTAL_FLOOR_AREA": "Total Floor Area",
+                "LODGEMENT_DATE": "Date EPC Lodged",
+                "POSTCODE": "Postcode on EPC"
+            }
+        ),
+        how="inner",
+        on="UPRN"
+    ).merge(
+        valuations.drop(columns=["ADDRESS", "POSTCODE"]).rename(
+            columns={
+                "Zoopla Valuation": "Expected Valuation",
+                "Zoopla Lower Bound": "Valuation - Lower Bound",
+                "Zoopla Upper Bound": "Valuation - Upper Bound",
+            }
+        ),
+        how="inner",
+        on="UPRN"
+    ).rename(
+        columns={
+            "CURRENT_ENERGY_RATING": "Current EPC",
+            "CURRENT_ENERGY_EFFICIENCY": "Current SAP Score",
+            "epc_address": "Address on EPC"
+        }
+    ).drop(
+        columns=["Title Number", "match_type", "UPRN"]
+    )
+
+    redacted_owner_names = df[["Company Registration No. (1)"]].drop_duplicates()
+    redacted_owner_names["Owner"] = ["Owner" + str(i) for i in range(1, len(redacted_owner_names) + 1)]
+
+    df = df.merge(
+        redacted_owner_names, how="left", on="Company Registration No. (1)"
+    )
+
+    df = df.drop(columns=["Company Registration No. (1)", "Proprietor Name (1)", "Property Address"])
+    df = df.sort_values(["Owner", "Date EPC Lodged"], ascending=False)
+
+    redacted_index = []
+    for _, owner_properties in df.groupby("Owner"):
+        top_50_percent = round(owner_properties.shape[0] / 2 + 0.00001)
+        indexes = owner_properties.tail(
+            owner_properties.shape[0] - top_50_percent
+        ).index
+
+        redacted_index.extend(indexes.tolist())
+
+    import numpy as np
+    # Redact addresses and postcodes
+    df["Address on EPC"] = np.where(
+        df.index.isin(redacted_index),
+        "Redacted",
+        df["Address on EPC"]
+    )
+
+    df["Postcode on EPC"] = np.where(
+        df.index.isin(redacted_index),
+        "Redacted",
+        df["Postcode on EPC"]
+    )
+
+    df.to_excel("Property List - 50% redacted.xlsx", index=False)
diff --git a/etl/customers/northern_gorup/test_asset_list.py b/etl/customers/northern_gorup/test_asset_list.py
new file mode 100644
index 00000000..46a4bb75
--- /dev/null
+++ b/etl/customers/northern_gorup/test_asset_list.py
@@ -0,0 +1,43 @@
+import pandas as pd
+from utils.s3 import save_csv_to_s3
+
+USER_ID = 8
+PORTFOLIO_ID = 81
+
+
+def app():
+    asset_list = [
+        {
+            'uprn': 10070868263,
+            "address": "Apartment 307, Flint Glass Wharf",
+            "postcode": "M4 6AD",
+        },
+        {
+            'uprn': 10070868244,
+            "address": "Apartment 106, Flint Glass Wharf",
+            "postcode": "M4 6AD",
+        }
+    ]
+
+    asset_list = pd.DataFrame(asset_list)
+
+    # Store the asset list in s3
+    filename = f"{USER_ID}/{PORTFOLIO_ID}/pilot.csv"
+    save_csv_to_s3(
+        dataframe=asset_list,
+        bucket_name="retrofit-plan-inputs-dev",
+        file_name=filename
+    )
+
+    body = {
+        "portfolio_id": str(PORTFOLIO_ID),
+        "housing_type": "Private",
+        "goal": "Increase EPC",
+        "goal_value": "B",
+        "trigger_file_path": filename,
+        "already_installed_file_path": "",
+        "patches_file_path": "",
+        "non_invasive_recommendations_file_path": "",
+        "budget": None,
+    }
+    print(body)
diff --git a/etl/customers/places_for_people/parity_comparison.py b/etl/customers/places_for_people/parity_comparison.py
new file mode 100644
index 00000000..64ab8591
--- /dev/null
+++ b/etl/customers/places_for_people/parity_comparison.py
@@ -0,0 +1,164 @@
+"""
+This script is used to pull together some case studies for the Parity Projects comparison
+"""
+
+import pandas as pd
+from backend.SearchEpc import SearchEpc
+from dotenv import load_dotenv
+import os
+
+load_dotenv("backend/.env")
+EPC_AUTH_TOKEN = os.getenv("EPC_AUTH_TOKEN")
+
+parity_measures = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Documents/hestia/Places For People/Parity Sample All Addresses and Measures.xlsx",
+    sheet_name="Total Measures"
+)
+
+solar_measures = parity_measures[parity_measures["Category"] == "SolarPV"]
+
+example_1 = parity_measures[
+    parity_measures["Address Id (used by website)"] == 6125299
+    ].copy()
+
+config = {
+    "address": "14 Victoria Road",
+    "postcode": "BD20 8SY",
+    "uprn": 100050346517
+}
+
+# Point 1:
+# Parity tends to re-score the EPCs, even if they're extremely recent.
+# For example, for '14, Victoria Road, Cross Hills, KEIGHLEY, North Yorkshire, ENGLAND, BD20 8SY',
+# the most recent EPC was done on 15 May 2023 and landed at 66D; however, for some reason Parity re-scored this
+# home as 63.91. It's unclear why this is done.
+
+example_1_measures = example_1[["MeasureGroupName", "Individual SAP increase"]].copy()
+# - LEDS: 0.25 SAP points
+# - 300mm of loft insulation from 200mm: 0.43 SAP points - where is this deduced from? Since the latest survey
+# indicates 250mm insulation in place
+# - Check construction of unknown party wall and fill cavity if appropriate: 0.12 SAP points (highly speculative,
+# not based on any data)
+# - Block open chimneys: 1.61 SAP points - latest survey showed 0 open fireplaces
+# - ASHP (45 degree emitters) with enhanced existing radiator central heating and hot water, from E rated gas boiler
+# 6.38 SAP points
+# - 4kWp PV array south and 30 degree pitch with no shading: 30.24 SAP points
+
+# Notes on solar - 30.24 seems like a lot
+# 400 watt is the solar panel output
+# Let's do a test for this property
+# This would be 10 solar panels
+# Using typical solar panel dimensions, this would be 19.63555m2 of roof space
+# The area of the roof is between 60 - 64.5 m2 (we use an API to get the roof data), implying only
+# around 30% of the roof is covered by solar panels
+# Using our machine learning model to simulate the impact of this on SAP, this would more likely result in
+# a ... (TODO: note left unfinished - record the simulated SAP change here)
+
+from utils.s3 import read_dataframe_from_s3_parquet
+
+training_data = read_dataframe_from_s3_parquet(
+    bucket_name="retrofit-data-dev",
+    file_key="sap_change_model/2024-06-09-10-36-53/dataset_rooms.parquet"
+)
+# Look for properties where the only difference is solar
+ending_cols = [
+    c for c in training_data.columns if "_ending" in c and "photo_supply" not in c
+]
+ending_cols = [
+    c for c in ending_cols if
+    c not in ["sap_ending", "heat_demand_ending", "carbon_ending", "transaction_type_ending", "days_to_ending"]
+]
+
+column_pairs = {}
+for col in ending_cols:
+    starting = col.split("_ending")[0]
+    if starting + "_starting" in training_data.columns:
+        starting_col = starting + "_starting"
+    else:
+        starting_col = starting
+
+    column_pairs[col] = starting_col
+
+filtered = training_data.copy()
+# Take rows that had solar installs
+filtered = filtered[filtered["photo_supply_ending"] != filtered["photo_supply_starting"]]
+for ending_col, starting_col in column_pairs.items():
+    filtered = filtered[filtered[ending_col] == filtered[starting_col]]
+    print(f"ending_col: {ending_col}, filtered shape: {filtered.shape}")
+
+avg_change = filtered.groupby("photo_supply_ending")["rdsap_change"].mean().reset_index()
+
+# I've taken every single case of there being two EPCs for a property, where the only difference between the first
+# and second is the solar installation. This is 2692 properties, across the UK. In only 4 instances has this resulted in
+# 30 or more SAP points
+
+
+# Some functions based on the SAP methodology:
+import numpy as np
+
+total_floor_area = 50
+occupants = calculate_occupants(total_floor_area)
+appliances_energy_use = estimate_electrical_appliances(occupants, total_floor_area)
+cooking_energy_use = estimate_cooking(occupants)
+
+
+def calculate_occupants(total_floor_area):
+    """
+    From Table 1b
+    :param total_floor_area:
+    :return:
+    """
+    return 1 + (1.76 * (1 - np.exp(-0.000349 * (total_floor_area - 13.9) * (total_floor_area - 13.9))) + 0.0013 * (
+        total_floor_area - 13.9))
+
+
+def estimate_electrical_appliances(occupants, total_floor_area):
+    """
+    From section L2 Electrical appliances
+    :param occupants:
+    :param total_floor_area:
+    :return:
+    """
+    e_a = 207.8 * np.power(total_floor_area * occupants, 0.4717)
+
+    days_in_month = {
+        1: 31,
+        2: 28,
+        3: 31,
+        4: 30,
+        5: 31,
+        6: 30,
+        7: 31,
+        8: 31,
+        9: 30,
+        10: 31,
+        11: 30,
+        12: 31
+    }
+
+    eam = 0
+    for m in range(1, 13):
+        nm = days_in_month[m]
+        eam += e_a * (1 + 0.157 * np.cos(2 * np.pi * (m - 1.78) / 12)) * nm / 365
+
+    return eam
+
+
+def estimate_cooking(occupants):
+    """
+    From section L3 Cooking
+    :param occupants:
+    :return:
+    """
+
+    return 35 + 7 * occupants
+
+
+primary_energy_per_m2 = 288  # kWh/m2 per year
+primary_energy_regulated = primary_energy_per_m2 * total_floor_area
+
+primary_energy_factor_electricity = 1.1  # Example factor
+primary_energy_appliances = appliances_energy_use * primary_energy_factor_electricity
+primary_energy_cooking = cooking_energy_use * primary_energy_factor_electricity * 365  # Annualize cooking energy
+
+total_primary_energy_use = primary_energy_regulated + primary_energy_appliances
diff --git a/etl/customers/stonewater/shdf_3_clustering.py b/etl/customers/stonewater/shdf_3_clustering.py
index f2ef9a8b..75917a55 100644
--- a/etl/customers/stonewater/shdf_3_clustering.py
+++ b/etl/customers/stonewater/shdf_3_clustering.py
@@ -593,3 +593,74 @@ def app():
     # "City/Town": "city_town",
     # "County": "county",
     # "Address ID": "external_address_id",
+
+
+def compile_data():
+    """
+    Various data sources have been produced to create the final data source for Stonewater.
+    This function combines them
+    :return:
+    """
+    ########################################################################
+    # Read in data
+    ########################################################################
+    asset_list = read_excel_from_s3(
+        file_key="customers/Stonewater/Stonewater SHDF_3_0_Board Triage 22.05.24.xlsx",
+        bucket_name="retrofit-data-dev",
+        header_row=4
+    )
+
+    # TODO: Read in UPRNs
+
+    ########################################################################
+    # Prepare asset list
+    ########################################################################
+    # TODO: Merge on UPRNs
+    # Drop the bottom 4 rows, which are completely missing
+    asset_list = asset_list.head(-4)
+
+    # Keep just the columns we're interested in
+    asset_list = asset_list[
+        [
+            "Osm. ID",
+            "Org. ref.",
+            "Postcode",
+            "House no",
+            "Name",
+            "Address line 2",
+            "City/Town",
+            "County",
+            "Address ID",  # This is not uprn
+        ]
+    ].rename(
+        columns={
+            "Osm. ID": "internal_id",
+            "Org. ref.": "customer_asset_id",
+            "Postcode": "postcode",
+            "House no": "house_number",
+            "Name": "address1",
+            "Address line 2": "address2",
+            "City/Town": "city_town",
+            "County": "county",
+            "Address ID": "external_address_id",
+        }
+    )
+
+    # Create full address
+    asset_list["full_address"] = np.where(
+        ~pd.isnull(asset_list["address2"]),
+        (
+            asset_list["address1"] + ", " +
+            asset_list["address2"] + ", " +
+            asset_list["city_town"].str.title() + ", " +
+            # asset_list["county"] + ", " +
+            asset_list["postcode"]
+        ),
+        asset_list["address1"] + ", " +
+        asset_list["city_town"].str.title() + ", " +
+        # asset_list["county"] + ", " +
+        asset_list["postcode"]
+    )
+
+    if pd.isnull(asset_list["full_address"]).sum():
+        raise ValueError("Missing full addresses")
diff --git a/recommendations/Costs.py b/recommendations/Costs.py
index 03190727..5f752730 100644
--- a/recommendations/Costs.py
+++ b/recommendations/Costs.py
@@ -20,21 +20,21 @@ regional_labour_variations = [
 
 # This data is based on the MCS database
 MCS_SOLAR_PV_COST_DATA = {
-    "last_updated": "2024-01-04",
-    "average_cost_per_kwh": 2013.94,
-    "average_cost_per_kwh-Outer London": 2618.75,
-    "average_cost_per_kwh-Inner London": 2618.75,
-    "average_cost_per_kwh-South East England": 2083.33,
-    "average_cost_per_kwh-South West England": 2113,
-    "average_cost_per_kwh-East of England": 1973.86,
-    "average_cost_per_kwh-East Midlands": 1981.86,
-    "average_cost_per_kwh-West Midlands": 1926.55,
-    "average_cost_per_kwh-North East England": 2028.49,
-    "average_cost_per_kwh-North West England": 1620.42,
-    "average_cost_per_kwh-Yorkshire and the Humber": 2060.9,
-    "average_cost_per_kwh-Wales": 1898.83,
-    "average_cost_per_kwh-Scotland": 1967.97,
-    "average_cost_per_kwh-Northern Ireland": 2126.09,
+    "last_updated": "2024-06-10",
+    "average_cost_per_kwh": 1750,
+    "average_cost_per_kwh-Outer London": 1776,
+    "average_cost_per_kwh-Inner London": 1776,
+    "average_cost_per_kwh-South East England": 1672,
+    "average_cost_per_kwh-South West England": 1732,
+    "average_cost_per_kwh-East of England": 1721,
+    "average_cost_per_kwh-East Midlands": 1730,
+    "average_cost_per_kwh-West Midlands": 1761,
+    "average_cost_per_kwh-North East England": 1669,
+    "average_cost_per_kwh-North West England": 1764,
+    "average_cost_per_kwh-Yorkshire and the Humber": 1705,
+    "average_cost_per_kwh-Wales": 1896,
+    "average_cost_per_kwh-Scotland": 1767,
+    "average_cost_per_kwh-Northern Ireland": 1767,
 }
 
 # This data is based on the MCS database, We use the larger figure between the 2023 and 2024 average,
diff --git a/recommendations/SolarPvRecommendations.py b/recommendations/SolarPvRecommendations.py
index a9255370..458eae12 100644
--- a/recommendations/SolarPvRecommendations.py
+++ b/recommendations/SolarPvRecommendations.py
@@ -4,10 +4,13 @@ from recommendations.recommendation_utils import override_costs
 
 
 class SolarPvRecommendations:
+    # Solar panel specs based on Eurener 400s solar panels
+    # https://midsummerwholesale.co.uk/buy/eurener/eurener-400w-mepv-zebra-ab-half-cut-mono
     # Approximate area of the solar panels
-    SOLAR_PANEL_AREA = 1.6
+    SOLAR_PANEL_AREA = 1.79
     # Wattage per panel - this is based on the average wattage of a solar panel being between 250w and 420w
-    SOLAR_PANEL_WATTAGE = 250
+    # This was previously set to 250w, but has been upped to 400w based on the systems used by Cotswold Energy Group
+    SOLAR_PANEL_WATTAGE = 400
 
     MAX_SYSTEM_WATTAGE = 6000
     MIN_SYSTEM_WATTAGE = 1000
diff --git a/recommendations/WallRecommendations.py b/recommendations/WallRecommendations.py
index 868c08c0..fb228b49 100644
--- a/recommendations/WallRecommendations.py
+++ b/recommendations/WallRecommendations.py
@@ -189,7 +189,7 @@ class WallRecommendations(Definitions):
         # recommend internal wall insulation as a possible measure
 
         u_value = self.property.walls["thermal_transmittance"]
-        u_value = None if math.isnan(u_value) else u_value
+        u_value = None if pd.isnull(u_value) else u_value
 
         is_cavity_wall = self.property.walls["is_cavity_wall"]
         insulation_thickness = self.property.walls["insulation_thickness"]