From 7513e475d3cac3a21a95b0096833a43914ee7974 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar
Date: Mon, 28 Oct 2024 10:57:26 +0000
Subject: [PATCH] adding in the basic structure of the extraction code

---
 .idea/Model.iml                               |   2 +-
 .idea/misc.xml                                |   2 +-
 .../stonewater/Wave 3 Preparation.py          | 122 +++++++++++++++++++
 .../requirements/requirements-wave-3-prep.txt |   1 +
 4 files changed, 125 insertions(+), 2 deletions(-)
 create mode 100644 etl/customers/stonewater/Wave 3 Preparation.py
 create mode 100644 etl/customers/stonewater/requirements/requirements-wave-3-prep.txt

diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
-
+
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
-
+
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
new file mode 100644
index 00000000..bd916494
--- /dev/null
+++ b/etl/customers/stonewater/Wave 3 Preparation.py
@@ -0,0 +1,122 @@
+import os
+import PyPDF2
+import re
+
+FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"
+
+
+def _parse_summary_text(text):
+    """
+    Pulls the summary figures out of the text of a summary report.
+
+    :param text: Plain text of the report, all pages concatenated.
+    :return: Dict with the keys "Current SAP rating", "Fuel Bill" and "Emissions (t/year)". A value is None when
+        its label was not found in the text.
+    """
+    data = {
+        "Current SAP rating": None,
+        "Fuel Bill": None,
+        "Emissions (t/year)": None,
+    }
+
+    # Extract Current SAP rating: one capital letter, a space and digits, e.g. "C 72"
+    sap_match = re.search(r"Current SAP rating:\s*([A-Z] \d+)", text)
+    if sap_match:
+        data["Current SAP rating"] = sap_match.group(1)
+
+    # Extract Fuel Bill. Thousands separators and pence are kept, so "£1,234.50" is not cut short to "£1"
+    fuel_bill_match = re.search(r"Fuel Bill:\s*£(\d+(?:,\d{3})*(?:\.\d+)?)", text)
+    if fuel_bill_match:
+        data["Fuel Bill"] = f"£{fuel_bill_match.group(1)}"
+
+    # Extract Emissions
+    emissions_match = re.search(r"Emissions \(t/year\):\s*([\d.]+) tonnes", text)
+    if emissions_match:
+        data["Emissions (t/year)"] = f"{emissions_match.group(1)} tonnes"
+
+    return data
+
+
+def extract_summary_report(pdf_path):
+    """
+    Extracts specific data from the provided PDF file.
+    Data includes:
+    - Current SAP rating
+    - Fuel Bill
+    - Emissions (t/year)
+
+    :param pdf_path: Path to the summary report PDF.
+    :return: Dict keyed by the three labels above; a value is None when it could not be found in the PDF text.
+    """
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        # If a page yields no text, treat it as empty rather than failing the whole report
+        text = "".join(page.extract_text() or "" for page in reader.pages)
+
+    return _parse_summary_text(text)
+
+
+def main():
+    """
+    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+
+    Walks every survey folder under FILE_PATH, finds its "retrofit assessment" subfolder and reads the summary
+    report PDF inside it.
+
+    :return: List with one dict per survey folder that had a summary report: the folder name under
+        "survey_folder" plus the fields returned by extract_summary_report.
+    :raises NotImplementedError: When a retrofit assessment folder has files but no summary report PDF; that case
+        is not handled yet.
+    """
+    # List only directories in the specified FILE_PATH
+    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+
+    extracted_data = []
+    for survey_folder in survey_folders:
+        survey_path = os.path.join(FILE_PATH, survey_folder)
+
+        # List the folders inside of the survey folder
+        survey_subfolders = [
+            name for name in os.listdir(survey_path) if os.path.isdir(os.path.join(survey_path, name))
+        ]
+
+        if not survey_subfolders:
+            continue
+
+        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment".
+        # If it exists, we will use the data from that folder
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+        if retrofit_folder is None:
+            # Without this guard os.path.join below would be handed None and raise a TypeError
+            continue
+
+        # List contents of the retrofit folder
+        retrofit_path = os.path.join(survey_path, retrofit_folder)
+        retrofit_files = os.listdir(retrofit_path)
+
+        if not retrofit_files:
+            continue
+
+        # We now look for specific files:
+        # 1) Check the summary report - the title will contain the word "summary" (any case) and the file extension
+        # is .pdf (any case)
+        summary_report = next(
+            (name for name in retrofit_files if "summary" in name.lower() and name.lower().endswith(".pdf")), None
+        )
+        if summary_report is not None:
+            pdf_path = os.path.join(retrofit_path, summary_report)
+            summary_data = extract_summary_report(pdf_path)
+            summary_data = {
+                "survey_folder": survey_folder,
+                **summary_data
+            }
+            extracted_data.append(summary_data)
+            continue
+
+        raise NotImplementedError("IMPLEMENT ME!")
+
+    return extracted_data
+
+
+if __name__ == "__main__":
+    main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
new file mode 100644
index 00000000..e9a5c8ea
--- /dev/null
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -0,0 +1 @@
+PyPDF2