From fff8f50f69cad56ffe353bdf2ab0aa6f2d12573e Mon Sep 17 00:00:00 2001 From: Khalim Conn-Kowlessar Date: Wed, 27 Nov 2024 10:16:06 +0000 Subject: [PATCH] wave 3 applications closed --- etl/customers/cottons/parse_pdf_asset_list.py | 64 +++++++++++++++++++ etl/customers/cottons/prep_asset_list.py | 15 +++++ etl/customers/gla/hug_postcodes.py | 46 +++++++++++++ etl/customers/ksquared/Wave3 Modelling.py | 2 +- .../stonewater/Wave 3 Preparation.py | 16 +++++ .../stonewater/potential_eco_properties.py | 38 +++++++++++ etl/route_march_data_pull/app.py | 21 +++--- 7 files changed, 191 insertions(+), 11 deletions(-) create mode 100644 etl/customers/cottons/parse_pdf_asset_list.py create mode 100644 etl/customers/cottons/prep_asset_list.py create mode 100644 etl/customers/gla/hug_postcodes.py diff --git a/etl/customers/cottons/parse_pdf_asset_list.py b/etl/customers/cottons/parse_pdf_asset_list.py new file mode 100644 index 00000000..7d442e97 --- /dev/null +++ b/etl/customers/cottons/parse_pdf_asset_list.py @@ -0,0 +1,64 @@ +import re +import pandas as pd +from PyPDF2 import PdfReader + +# Paths to the uploaded files +file_paths = [ + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged).pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 2.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 3.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 4.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 5.pdf", + "/Users/khalimconn-kowlessar/Downloads/Managed Properties List (dragged) 6.pdf" +] + + +# Function to extract text from PDFs +def extract_text_from_pdf_with_pypdf2(file_path): + text = "" + reader = PdfReader(file_path) + for page in reader.pages: + text += page.extract_text() + return text + + +# Initialize a list to hold all parsed data +all_parsed_data = [] + +# Process each PDF individually +for i, path in enumerate(file_paths): + # 
Extract text from the PDF + extracted_text = extract_text_from_pdf_with_pypdf2(path) + + # Step 1: Remove titles and repeated headers + cleaned_text = re.sub(r"Managed Property Report as at \d+ \w+ \d+", "", extracted_text) + cleaned_text = re.sub(r"Code Property Address Management Type", "", cleaned_text) + + # Step 2: Extract rows ending with "Managed" + rows = re.findall(r".*?Managed", cleaned_text) + + # Step 3: Parse rows into structured data + parsed_data = [] + for row in rows: + match = re.match(r"(\S+)\s+(.+?)\s+Managed", row.strip()) + if match: + code = match.group(1).strip() + address = match.group(2).strip() + parsed_data.append((code, address, "Managed")) + + # Append parsed data to the global list + all_parsed_data.extend(parsed_data) + + # Provide feedback for debugging + print(f"File {i + 1} processed: {len(parsed_data)} rows") + +# Step 4: Create a unified DataFrame +final_df = pd.DataFrame(all_parsed_data, columns=["Code", "Property Address", "Management Type"]) + +# Step 5: Save the unified DataFrame to an Excel file +final_output_file_path = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Unified_Managed_Properties_List.xlsx" +final_df.to_excel(final_output_file_path, index=False) + +# Provide feedback +print(f"All files processed and combined. Total rows: {len(final_df)}") +print(f"Unified file saved to: {final_output_file_path}") diff --git a/etl/customers/cottons/prep_asset_list.py b/etl/customers/cottons/prep_asset_list.py new file mode 100644 index 00000000..db7c6583 --- /dev/null +++ b/etl/customers/cottons/prep_asset_list.py @@ -0,0 +1,15 @@ +import pandas as pd + +df = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx" +) + +# split up the address on commas. 
First section is address1, last section is postcode +df["address1"] = df["Property Address"].apply(lambda x: x.split(",")[0].strip()) +df["postcode"] = df["Property Address"].apply(lambda x: x.split(",")[-1].strip()) + +# Re-save +df.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/Cottons Asset List.xlsx", + index=False, +) diff --git a/etl/customers/gla/hug_postcodes.py b/etl/customers/gla/hug_postcodes.py new file mode 100644 index 00000000..85783d62 --- /dev/null +++ b/etl/customers/gla/hug_postcodes.py @@ -0,0 +1,46 @@ +import inspect +import pandas as pd +from pathlib import Path +from tqdm import tqdm +from etl.epc.settings import EARLIEST_EPC_DATE + +src_file_path = inspect.getfile(lambda: None) + +EPC_DIRECTORY = Path("/Users/khalimconn-kowlessar/Downloads/all-domestic-certificates") +epc_directories = [entry for entry in EPC_DIRECTORY.iterdir() if entry.is_dir()] + +aggregation = [] +for directory in tqdm(epc_directories): + data = pd.read_csv(directory / "certificates.csv", low_memory=False) + # Rename the columns to the same format as the api returns + data.columns = [c.replace("_", "-").lower() for c in data.columns] + + data = data[data["posttown"].str.contains("London", case=False, na=False)] + if data.empty: + continue + # Keep only EPCs lodged on or after the date threshold + data = data[data["lodgement-date"] >= EARLIEST_EPC_DATE] + + data = data[~pd.isnull(data["uprn"])] + # Take just the newest EPC per uprn, based on lodgement-date + data = data.sort_values("lodgement-date", ascending=False).drop_duplicates("uprn") + # Take EPC D and below + data = data[data["current-energy-rating"].isin(["D", "E", "F", "G"])] + data["postal_region"] = data["postcode"].str.split(" ").str[0] + + # Take homes that don't have a gas boiler + off_gas = data[~data["main-fuel"].str.contains("mains gas", case=False, na=False)] + + region_summary = off_gas.groupby("postal_region").size().reset_index(name="count") + + aggregation.append(region_summary) + 
+postal_region_aggregation = pd.concat(aggregation) +postal_region_aggregation = postal_region_aggregation.sort_values("count", ascending=False) +postal_region_aggregation = postal_region_aggregation.rename( + columns={"postal_region": "Postcode Region", "count": "Number of Homes"} +) +postal_region_aggregation.to_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/GLA/Off Gas EPC D-G Postal Regions.xlsx", + index=False +) diff --git a/etl/customers/ksquared/Wave3 Modelling.py b/etl/customers/ksquared/Wave3 Modelling.py index 7bfa33b3..0bf6eb18 100644 --- a/etl/customers/ksquared/Wave3 Modelling.py +++ b/etl/customers/ksquared/Wave3 Modelling.py @@ -305,7 +305,7 @@ def caha(): # Get conservation area data uprns = [x["uprn"] for x in extracted_data if x["uprn"] not in ["", None]] - conservation_area_data = OpenUprnClient.get_spatial_data([100022526362], "retrofit-data-dev") + conservation_area_data = OpenUprnClient.get_spatial_data([36284], "retrofit-data-dev") addresses = pd.DataFrame(asset_list) addresses["uprn"] = addresses["uprn"].astype(str) diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py index b6c29863..77200e69 100644 --- a/etl/customers/stonewater/Wave 3 Preparation.py +++ b/etl/customers/stonewater/Wave 3 Preparation.py @@ -2591,5 +2591,21 @@ def propsed_wave_3_sample(): os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv"), index=False ) + survey_results = pd.read_excel( + os.path.join(CUSTOMER_FOLDER_PATH, "Stonewater - Bid Packages WIP 14.11.19 V2.xlsx"), + header=13, + sheet_name="Modelled Packages" + ) + + indivual_units = pd.read_csv( + os.path.join(CUSTOMER_FOLDER_PATH, "Individual units - programme V2.csv") + ) + + u_aids = survey_results["Archetype ID"].astype(str).unique() + units_in_bid = indivual_units[indivual_units['Unit in Programme']]["Archetype ID"].astype(str).values + + len({v for v in units_in_bid if str(v) in u_aids}) + 
len(list(set(units_in_bid))) + # if __name__ == "__main__": # main() diff --git a/etl/customers/stonewater/potential_eco_properties.py b/etl/customers/stonewater/potential_eco_properties.py index 4fb89113..c0301e9a 100644 --- a/etl/customers/stonewater/potential_eco_properties.py +++ b/etl/customers/stonewater/potential_eco_properties.py @@ -375,3 +375,41 @@ def app(): "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Stonewater Properties Needing CWI - WIP.csv", index=False ) + + +def cross_reference_epc_programme(): + eco3_fallout = pd.read_excel( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/STONEWATER LIST OF ADDRESSES TO BE " + "SURVEYED - ECO3 NOT COMPLETED.xlsx" + ) + + eco3_fallout["house_number"] = eco3_fallout.apply( + lambda x: SearchEpc.get_house_number(x["ADDRESS"], ""), axis=1 + ) + + # for _, x in eco3_fallout.ite + + stonewater_modelled_above_c = pd.read_csv( + "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Osmosis Reviewed - Parity Download 18.7 - " + "master sheet.csv", + encoding='latin1' + ) + + stonewater_modelled_above_c["house_number"] = stonewater_modelled_above_c.apply( + lambda x: SearchEpc.get_house_number(x["Address"], x["Postcode"]), axis=1 + ) + + eco3_fallout_matched_to_above_c = [] + for _, property in eco3_fallout.iterrows(): + # Match on house number + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["house_number"] == property["house_number"] + ] + + # We do a fuzzy match on the address, with Levenshtein distance + + from fuzzywuzzy import fuzz + match = stonewater_modelled_above_c[ + stonewater_modelled_above_c["Address"].apply(lambda x: fuzz.ratio(x, property["ADDRESS"]) > 90) + ] + match.head() diff --git a/etl/route_march_data_pull/app.py b/etl/route_march_data_pull/app.py index 6f9dd135..b53b36c2 100644 --- a/etl/route_march_data_pull/app.py +++ b/etl/route_march_data_pull/app.py @@ -120,17 +120,17 @@ def app(): Property UPRN """ - DATA_FOLDER = 
"/Users/khalimconn-kowlessar/Documents/hestia/Customers/Bromford/" - DATA_FILENAME = "Bromford programme review.xlsx" - SHEET_NAME = "Bromford" - POSTCODE_COLUMN = "Postcode" - FULLADDRESS_COLUMN = None - ADDRESS1_COLUMN = "No." - ADDRESS1_METHOD = "first_two_words" - ADDRESS_COLS_TO_CONCAT = ["No.", "Address"] + DATA_FOLDER = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Cottons/" + DATA_FILENAME = "Cottons Asset List.xlsx" + SHEET_NAME = "Sheet1" + POSTCODE_COLUMN = "postcode" + FULLADDRESS_COLUMN = "Property Address" + ADDRESS1_COLUMN = "address1" + ADDRESS1_METHOD = None + ADDRESS_COLS_TO_CONCAT = [] asset_list = pd.read_excel(os.path.join(DATA_FOLDER, DATA_FILENAME), header=0, sheet_name=SHEET_NAME) - asset_list = asset_list[~pd.isnull(asset_list["Postcode"])] + # asset_list = asset_list[~pd.isnull(asset_list[POSTCODE_COLUMN])].reset_index() asset_list["row_id"] = asset_list.index # We clean up portential non-breaking spaces, and double spaces @@ -202,7 +202,8 @@ def app(): transformed_df = pd.DataFrame(transformed_data) # Drop the column that is "" - transformed_df = transformed_df.drop(columns=[""]) + if "" in transformed_df.columns: + transformed_df = transformed_df.drop(columns=[""]) # Get the find my epc data find_my_epc_data = epc_df[["row_id", "find_my_epc_data"]].drop(columns=["find_my_epc_data"]).join(