From 7513e475d3cac3a21a95b0096833a43914ee7974 Mon Sep 17 00:00:00 2001
From: Khalim Conn-Kowlessar <kconnkowlessar@gmail.com>
Date: Mon, 28 Oct 2024 10:57:26 +0000
Subject: [PATCH] adding in the basic structure of the extraction code

---
 .idea/Model.iml                               |  2 +-
 .idea/misc.xml                                |  2 +-
 .../stonewater/Wave 3 Preparation.py          | 92 +++++++++++++++++++
 .../requirements/requirements-wave-3-prep.txt |  1 +
 4 files changed, 95 insertions(+), 2 deletions(-)
 create mode 100644 etl/customers/stonewater/Wave 3 Preparation.py
 create mode 100644 etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
diff --git a/.idea/Model.iml b/.idea/Model.iml
index df6c4faa..762580d9 100644
--- a/.idea/Model.iml
+++ b/.idea/Model.iml
@@ -7,7 +7,7 @@
       <sourceFolder url="file://$MODULE_DIR$/open_uprn" isTestSource="false" />
       <sourceFolder url="file://$MODULE_DIR$/recommendations" isTestSource="false" />
     </content>
-    <orderEntry type="jdk" jdkName="Fastapi-backend" jdkType="Python SDK" />
+    <orderEntry type="jdk" jdkName="Stonewater-wave-3" jdkType="Python SDK" />
     <orderEntry type="sourceFolder" forTests="false" />
   </component>
   <component name="PyNamespacePackagesService">
diff --git a/.idea/misc.xml b/.idea/misc.xml
index 50cad4ca..c916a158 100644
--- a/.idea/misc.xml
+++ b/.idea/misc.xml
@@ -3,7 +3,7 @@
   <component name="Black">
     <option name="sdkName" value="Python 3.10 (backend)" />
   </component>
-  <component name="ProjectRootManager" version="2" project-jdk-name="Fastapi-backend" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Stonewater-wave-3" project-jdk-type="Python SDK" />
   <component name="PyCharmProfessionalAdvertiser">
     <option name="shown" value="true" />
   </component>
diff --git a/etl/customers/stonewater/Wave 3 Preparation.py b/etl/customers/stonewater/Wave 3 Preparation.py
new file mode 100644
index 00000000..bd916494
--- /dev/null
+++ b/etl/customers/stonewater/Wave 3 Preparation.py	
@@ -0,0 +1,92 @@
+import os
+import PyPDF2
+import re
+
+FILE_PATH = "/Users/khalimconn-kowlessar/Documents/hestia/Customers/Stonewater/Wave 3 Surveys"  # root that main() scans
+
+
+def extract_summary_report(pdf_path):
+    """
+    Read a summary report PDF and pull out its headline figures.
+
+    - "Current SAP rating:" followed by a capital letter and a number, e.g. "D 62"
+    - "Fuel Bill:" followed by a whole-pound amount, returned with its "£" prefix
+    - "Emissions (t/year):" followed by a figure, returned with its " tonnes" suffix
+
+    :param pdf_path: path of the PDF file to read
+    :return: dict with the keys "Current SAP rating", "Fuel Bill" and "Emissions (t/year)";
+        a value is None when its label was not found in the text
+    """
+    with open(pdf_path, "rb") as pdf_file:
+        # Pull the text of every page while the file is still open for the reader
+        full_text = "".join(page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages)
+
+    def first_capture(pattern):
+        """Return group 1 of the first match of pattern in the PDF text, or None."""
+        found = re.search(pattern, full_text)
+        return found.group(1) if found else None
+
+    sap_rating = first_capture(r"Current SAP rating:\s*([A-Z] \d+)")
+    fuel_bill = first_capture(r"Fuel Bill:\s*£(\d+)")
+    emissions = first_capture(r"Emissions \(t/year\):\s*([\d.]+) tonnes")
+
+    # Re-attach the currency symbol / unit that the patterns matched but did not capture
+    if fuel_bill is not None:
+        fuel_bill = f"£{fuel_bill}"
+    if emissions is not None:
+        emissions = f"{emissions} tonnes"
+
+    return {
+        "Current SAP rating": sap_rating,
+        "Fuel Bill": fuel_bill,
+        "Emissions (t/year)": emissions,
+    }
+
+
+def main():
+    """
+    This code prepares the data for the Warm Homes: Social Housing Fund Wave 3, for Stonewater.
+
+    For every survey folder under FILE_PATH, the summary report PDF in its "retrofit assessment"
+    subfolder is parsed and collected in a list (which nothing consumes yet).
+
+    :raises NotImplementedError: if a non-empty retrofit assessment folder has no summary report
+    """
+    # List only directories in the specified FILE_PATH
+    survey_folders = [name for name in os.listdir(FILE_PATH) if os.path.isdir(os.path.join(FILE_PATH, name))]
+
+    extracted_data = []
+    for survey_folder in survey_folders:
+        # List the folders inside of the survey folder
+        survey_path = os.path.join(FILE_PATH, survey_folder)
+        survey_subfolders = [name for name in os.listdir(survey_path)
+                             if os.path.isdir(os.path.join(survey_path, name))]
+
+        # Check for a folder inside of the survey_subfolders containing the phrase "retrofit assessment".
+        # If it exists, we will use the data from that folder
+        retrofit_folder = next((name for name in survey_subfolders if "retrofit assessment" in name.lower()), None)
+        if retrofit_folder is None:
+            # Also covers a survey with no subfolders; os.path.join() would raise TypeError on None
+            continue
+
+        # List contents of the retrofit folder
+        retrofit_path = os.path.join(survey_path, retrofit_folder)
+        retrofit_files = os.listdir(retrofit_path)
+        if not retrofit_files:
+            continue
+
+        # We now look for specific files:
+        # 1) Check the summary report - its name contains "summary" and ends in ".pdf" (case-insensitive)
+        summary_report = next(
+            (name for name in retrofit_files if "summary" in name.lower() and name.lower().endswith(".pdf")), None
+        )
+        if summary_report is not None:
+            summary_data = extract_summary_report(os.path.join(retrofit_path, summary_report))
+            extracted_data.append({"survey_folder": survey_folder, **summary_data})
+            continue
+
+        raise NotImplementedError("IMPLEMENT ME!")
+
+
+if __name__ == "__main__":  # run the preparation only when executed as a script
+    main()
diff --git a/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
new file mode 100644
index 00000000..e9a5c8ea
--- /dev/null
+++ b/etl/customers/stonewater/requirements/requirements-wave-3-prep.txt
@@ -0,0 +1 @@
+PyPDF2