diff --git a/etl/customers/gla/__init__.py b/etl/customers/gla/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/etl/customers/gla/proposal_investigation.py b/etl/customers/gla/proposal_investigation.py
new file mode 100644
index 00000000..e36d82b8
--- /dev/null
+++ b/etl/customers/gla/proposal_investigation.py
@@ -0,0 +1,78 @@
+"""
+This script performs some basic analysis to identify EPC data for postcodes specified in the Warmer Homes Local Grant.
+
+It loads the eligible postcodes, samples those starting with "BN15", runs the Ownership matching pipeline on the
+Adur EPC certificates, and keeps the matches in the sampled postcodes that are not socially rented.
+"""
+
+import pandas as pd
+from etl.ownership.Ownership import Ownership
+
+postcodes = pd.read_excel(
+    "/Users/khalimconn-kowlessar/Downloads/WHLG-eligible-postcodes.xlsx", sheet_name='Eligible postcodes'
+)
+# Take just the first two columns
+postcodes = postcodes[
+    ['List of eligible postcodes via the IMD Income Decile 1-2 pathway', 'Unnamed: 1']
+]
+
+postcodes.columns = ['postcode', 'Local Authority']
+# Drop the first two rows (index labels 0 and 1)
+postcodes = postcodes.drop([0, 1])
+# Since there are a large number of postcodes (425k), let's just take a few examples
+# Take postcodes that begin with "BN15" (na=False treats missing postcodes as non-matching)
+postcodes = postcodes[postcodes["postcode"].str.startswith("BN15", na=False)]
+
+# The Local Authority is Adur, so let's get the EPC data for this area
+# epc_data = pd.read_csv(
+#     "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223-Adur"
+#     "/certificates.csv", low_memory=False
+# )
+# # Filter on these postcodes
+# epc_data = epc_data[epc_data["POSTCODE"].str.lower().isin(postcodes["postcode"].str.lower())]
+# epc_data = epc_data[~pd.isnull(epc_data["UPRN"])]
+# # Take the newest EPC for each UPRN, based on LODGEMENT_DATE
+# epc_data["LODGEMENT_DATE"] = pd.to_datetime(epc_data["LODGEMENT_DATE"])
+# epc_data = epc_data.sort_values("LODGEMENT_DATE", ascending=False).drop_duplicates("UPRN")
+#
+# # Let's look at the breakdown of EPC ratings. We want the count and the % of the total
+# ratings_distribution = epc_data.groupby("CURRENT_ENERGY_RATING").size().reset_index()
+# ratings_distribution.columns = ["Rating", "Count"]
+# ratings_distribution["Percentage"] = ratings_distribution["Count"] / ratings_distribution["Count"].sum() * 100
+
+# Can we identify the owners of these units so we can contact them?
+ownership = Ownership(
+    epc_paths=[
+        "/Users/khalimconn-kowlessar/Documents/hestia/Model/local_data/all-domestic-certificates/domestic-E07000223"
+        "-Adur/certificates.csv"
+    ],
+    domestic_ownership_path="/Users/khalimconn-kowlessar/Downloads/CCOD_FULL_2024_07.csv",
+    overseas_ownership_path="/Users/khalimconn-kowlessar/Downloads/OCOD_FULL_2024_07.csv",
+    land_registry_path="/Users/khalimconn-kowlessar/Downloads/pp-complete.csv",
+    project_name="gla-proposal",
+    bucket="retrofit-data-dev",
+    average_property_value=0,
+    portfolio_value=0,
+    excluded_owners=[],
+    excluded_uprns=[],
+    save=False
+)
+
+# Step 1: Source the EPC properties (with save=False the EPC data is not uploaded to S3)
+ownership.source_epc_properties(column_filters={})
+
+# Step 2: Get company ownership data
+ownership.load_company_ownership()
+
+# Step 3: Prepare data for matching
+ownership.prepare_for_matching()
+
+# Step 4: Match EPC data to ownership data
+ownership.match()
+
+# We have the matches, which we now need to match to the postcodes
+matches = ownership.matched_addresses.copy()
+# filter matches on the postcodes we're interested in
+matches = matches[matches["epc_postcode"].str.lower().isin(postcodes["postcode"].str.lower())]
+# Remove any socially rented properties (filtered on the EPC TENURE field)
+matches = matches[~matches["TENURE"].isin(["Rented (social)", "rental (social)"])]
diff --git a/etl/ownership/Ownership.py b/etl/ownership/Ownership.py
index 3bc4b60d..2079391c 100644
--- a/etl/ownership/Ownership.py
+++ b/etl/ownership/Ownership.py
@@ -61,6 +61,7 @@ class Ownership:
         portfolio_value: float,
         excluded_owners: List[str] = None,
         excluded_uprns: List[int] = None,
+        save: bool = True,
     ):
         """
 
@@ -115,6 +116,8 @@ class Ownership:
             f"ownership/{self.project_name}/{self.run_timestamp}/portfolio_epc_data.xlsx"
         )
 
+        self.save = save
+
         # Data
         self.epc_data = None
         self.ownership_data = None
@@ -210,12 +213,13 @@ class Ownership:
         if self.excluded_uprns:
             self.epc_data = self.epc_data[~self.epc_data["UPRN"].astype(float).isin(self.excluded_uprns)]
 
-        # We now store the data in s3
-        save_excel_to_s3(
-            df=self.epc_data,
-            bucket_name=self.bucket,
-            file_key=self.epc_data_filepath
-        )
+        if self.save:
+            # We now store the data in s3
+            save_excel_to_s3(
+                df=self.epc_data,
+                bucket_name=self.bucket,
+                file_key=self.epc_data_filepath
+            )
 
     def load_company_ownership(self):
         """
@@ -590,7 +594,8 @@ class Ownership:
                 "CURRENT_ENERGY_RATING",
                 "POSTCODE",
                 "LODGEMENT_DATE",
-                "TRANSACTION_TYPE"
+                "TRANSACTION_TYPE",
+                "TENURE",
             ]
         ].rename(
             columns={