allow postcode filtering

2026-06-08 11:17:27 +00:00 · 2024-10-10 18:51:12 +01:00 · 2024-10-10 18:51:12 +01:00 · f53ce8b430
commit f53ce8b430
parent a953a1f0ee
2 changed files with 21 additions and 12 deletions
--- a/etl/customers/gla/proposal_investigation.py
+++ b/etl/customers/gla/proposal_investigation.py
@ -1,9 +1,12 @@
 """
 This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant
 """
-from nis import match

+import inspect
+import requests
+import json
 import pandas as pd
+from pathlib import Path
 from etl.ownership.Ownership import Ownership

 postcodes = pd.read_excel(
@ -19,7 +22,7 @@ postcodes.columns = ['postcode', 'Local Authority']
 postcodes = postcodes.drop([0, 1])
 # Since there are a large number of postcodes (425k), let's just take a few examples
 # Take postcodes that begin with "BN15"
-postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]
+# postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]

 # The Local Authority is Adur, so let's get the EPC data for this area
 # epc_data = pd.read_csv(
@ -39,11 +42,14 @@ postcodes = postcodes[postcodes["postcode"].str.startswith("BN15")]
 # ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100

 # Can we identify the owners of these units so we can contact them?
+
+file_src = inspect.getfile(lambda x: None)
+DATA_DIRECTORY = Path(file_src).parent / "local_data" / "all-domestic-certificates"
+epc_paths = [entry for entry in DATA_DIRECTORY.iterdir() if entry.is_dir()]
+epc_paths = [str(entry / "certificates.csv") for entry in epc_paths]
+
 ownership = Ownership(
-    epc_paths=[
-        "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223"
-        "-Adur/certificates.csv"
-    ],
+    epc_paths=epc_paths,
    domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
    overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
    land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
@ -53,11 +59,11 @@ ownership = Ownership(
    portfolio_value=0,
    excluded_owners=[],
    excluded_uprns=[],
-    save=False
+    save=True
 )

 # Data will be found at ownership/gla-proposal
-ownership.source_epc_properties(column_filters={})
+ownership.source_epc_properties(column_filters={}, postcodes=postcodes["postcode"].str.lower().tolist())

 # Step 2: Get company ownership data
 ownership.load_company_ownership()
@ -83,9 +89,6 @@ owners_count["Percentage"] = owners_count["Count"] / owners_count["Count"].sum()

 companies_house_api_key = "1d9c2877-3271-4642-80ed-a6170971653f"

-import requests
-import json
-
 company_number = "13197205"
 url = f'https://api.company-information.service.gov.uk/company/{company_number}'

--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@ -175,7 +175,7 @@ class Ownership:
        # Prepare the final outputs:
        self.create_final_matches()

-    def source_epc_properties(self, column_filters=None):
+    def source_epc_properties(self, column_filters=None, postcodes=None):
        """
        This function will filter the epc data as specified by column filters, searching across all of the EPC tables
        :param column_filters: Dictionary with column names as keys and list of acceptable values as values. This
@ -183,6 +183,7 @@ class Ownership:
                                {"column_name": ["value1", "value2", ...]}, where column_name is the name of the column
                                in the EPC data and ["value1", "value2", ...] is a list of acceptable values for that
                                column. If a column is not found in the EPC data, an exception is raised.
+        :param postcodes: Optional list of lower-case postcodes to keep (matched against lower-cased POSTCODE)
        """

        column_filters = {} if column_filters is None else column_filters
@ -206,6 +207,11 @@ class Ownership:
                else:
                    raise Exception(f"Column {column} not found in data. column_filters is malformed")

+            if postcodes is not None:
+                epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes)]
+            if epc_data.empty:
+                continue
+
            data.append(epc_data)

        self.epc_data = pd.concat(data, ignore_index=True)