setting up code for gla proposal

2026-07-27 23:35:01 +00:00 · 2024-10-10 18:12:29 +01:00 · 2024-10-10 18:12:29 +01:00 · 2d7e9a3cc9
commit 2d7e9a3cc9
parent ecf05369dd
3 changed files with 88 additions and 7 deletions
--- a/etl/customers/gla/init.py
+++ b/etl/customers/gla/init.py
--- a/etl/customers/gla/proposal_investigation.py
+++ b/etl/customers/gla/proposal_investigation.py
@ -0,0 +1,76 @@
+"""
+Basic analysis to identify EPC data and owners for postcodes eligible for the Warmer Homes Local Grant (WHLG).
+"""
+
+import pandas as pd
+
+from etl.ownership.Ownership import Ownership
+
+postcodes = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes'
+)
+# Keep just the postcode column and the unnamed column next to it (the local authority)
+postcodes = postcodes[
+    ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1']
+]
+
+postcodes.columns = ['postcode', 'Local Authority']
+# Drop the rows labelled 0 and 1 (the first two rows; presumably header residue from the sheet - TODO confirm)
+postcodes = postcodes.drop([0, 1])
+# Since there are a large number of postcodes (425k), let's just take a few examples
+# Take postcodes that begin with "BN15" (na=False: blank cells are excluded rather than breaking the mask)
+postcodes = postcodes[postcodes["postcode"].str.startswith("BN15", na=False)]
+
+# The Local Authority is Adur, so let's get the EPC data for this area
+# epc_data = pd.read_csv(
+#     "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur"
+#     "/certificates.csv", low_memory=False
+# )
+# # Filter on these postcodes
+# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())]
+# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
+# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE
+# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
+# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
+#
+# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total
+# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index()
+# ratings_distribution.columns = ["Rating", "Count"]
+# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100
+
+# Can we identify the owners of these units so we can contact them?
+ownership = Ownership(
+    epc_paths=[
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223"
+        "-Adur/certificates.csv"
+    ],
+    domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
+    overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
+    land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
+    project_name="gla-proposal",
+    bucket="retrofit-data-dev",
+    average_property_value=0,
+    portfolio_value=0,
+    excluded_owners=[],
+    excluded_uprns=[],
+    save=False
+)
+
+# Step 1: Source the EPC properties; save=False skips the S3 upload under ownership/gla-proposal/
+ownership.source_epc_properties(column_filters={})
+
+# Step 2: Get company ownership data
+ownership.load_company_ownership()
+
+# Step 3: Prepare data for matching
+ownership.prepare_for_matching()
+
+# Step 4: Match EPC data to ownership data
+ownership.match()
+
+# We have the matches, which we now need to match to the postcodes
+matches = ownership.matched_addresses.copy()
+# Filter matches on the postcodes we're interested in
+matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())]
+# Remove any socially rented properties (filtering on the EPC tenure, not the transaction type)
+matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])]
--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@ -61,6 +61,7 @@ class Ownership:
        portfolio_value: float,
        excluded_owners: List[str] = None,
        excluded_uprns: List[int] = None,
+        save=True
    ):
        """

@ -115,6 +116,8 @@ class Ownership:
            f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
        )

+        self.save = save
+
        # Data
        self.epc_data = None
        self.ownership_data = None
@ -210,12 +213,13 @@ class Ownership:
        if self.excluded_uprns:
            self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]

-        # We now store the data in s3
-        save_excel_to_s3(
-            df=self.epc_data,
-            bucket_name=self.bucket,
-            file_key=self.epc_data_filepath
-        )
+        if self.save:
+            # We now store the data in s3
+            save_excel_to_s3(
+                df=self.epc_data,
+                bucket_name=self.bucket,
+                file_key=self.epc_data_filepath
+            )

    def load_company_ownership(self):
        """
@ -590,7 +594,8 @@ class Ownership:
                    "CURRENT_ENERGY_RATING",
                    "POSTCODE",
                    "LODGEMENT_DATE",
-                    "TRANSACTION_TYPE"
+                    "TRANSACTION_TYPE",
+                    "TENURE",
                ]
            ].rename(
                columns={